Parsing LinkedIn profiles yielded from a search query using Selenium and Python


Question

Objective

  • I'm creating a Python program for web-scraping LinkedIn that will take user login credentials and a custom search query as inputs, use Selenium to navigate the profiles yielded as results, extract data from specific webpage elements, and store it in a pandas data frame.

Problem

  • I have provided my code below. My main concern is that it takes a long time to complete a run successfully (approx. 23 minutes to parse 68 profiles). Could anyone help me optimize the code for speed? Thanks!

Code

# imports
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd

userid = "userid@domain.com"
password = "p@Ssw0rd!"
keyword = "Master of Business Data Science Otago"
url = f"https://www.linkedin.com/search/results/people/?keywords={keyword}&origin=SWITCH_SEARCH_VERTICAL&sid=RZW"

driver = webdriver.Chrome()

driver.get("https://www.linkedin.com")
driver.implicitly_wait(6)
driver.find_element(By.XPATH, "//*[@id='session_key']").send_keys(userid)
driver.find_element(By.XPATH, "//*[@id='session_password']").send_keys(password)
driver.find_element(By.XPATH, "//button[@class='sign-in-form__submit-button']").click()

driver.get(url)
links = []
scroll_target = driver.find_element(By.CLASS_NAME, "background-mercado") # scroll target: the LinkedIn logo at the bottom, so that the 'Next' button's element becomes visible
driver.execute_script('arguments[0].scrollIntoView(true)', scroll_target)

while True:
    try:
        time.sleep(3)
        linky = driver.find_elements(By.CLASS_NAME, 'app-aware-link ') # locate the containers housing links on the page
        links.append([li.get_attribute('href') for li in linky[::2] if 'miniProfileUrn' in str(li.get_attribute('href'))]) # keep only profile links from the list of all links
        page_button = driver.find_element(By.XPATH, '//button[@aria-label="Next"]') # locate the 'Next' button to click through to the next page of results
        page_button.click()
    except:
        print("No more pages")
        break

# locate and parse the individual elements in each captured profile
elements = {'name': "//h1[@class='text-heading-xlarge inline t-24 v-align-middle break-words']",
            'prefix': "//div[@class='text-body-small v-align-middle break-words t-black--light']",
            'title': "//div[@class='text-body-medium break-words']",
            'location': "//div[@class='text-body-small inline t-black--light break-words']",
            'see_more': "//button[@class='inline-show-more-text__button inline-show-more-text__button--light link']",
            'about': "//div[@class='inline-show-more-text full-width']",
            'experience': "//ul[@class='pvs-list']",
            'expander': "a[@class='optional-action-target-wrapper artdeco-button artdeco-button--tertiary artdeco-button--standard artdeco-button--2 artdeco-button--muted inline-flex justify-center full-width align-items-center artdeco-button--fluid ']"}
profiles = []
for n in links:
    for m in n:
        driver.get(m)
        time.sleep(2)
        try:
            see_mores = driver.find_elements(By.XPATH, elements['see_more']) # locate 'see more' buttons at paragraph ends to expand long descriptive fields
            for s in see_mores:
                s.click()
        except:
            print("没有'查看更多'按钮")
        time.sleep(1)
        try:
            expanders = driver.find_elements(By.XPATH, elements['expander']) # locate expander buttons to expand sections with collapsed data entries
            for e in expanders:
                e.click()
        except:
            print("没有扩展按钮")
        time.sleep(1)
        try:
            name = driver.find_element(By.XPATH, elements['name']).text
        except:
            name = None
        try:
            prefix = driver.find_element(By.XPATH, elements['prefix']).text
        except:
            prefix = None
        try:
            title = driver.find_element(By.XPATH, elements['title']).text
        except:
            title = None
        try:
            location = driver.find_element(By.XPATH, elements['location']).text
        except:
            location = None
        try:
            about = driver.find_element(By.XPATH, elements['about']).text
        except:
            about = None
        try:
            experience = driver.find_elements(By.XPATH, elements['experience'])[1].text
        except:
            experience = None
        profiles.append({'name': name, 'prefix': prefix, 'title': title, 'location': location, 'about': about, 'experience': experience})

# store in a data frame and export to a CSV
profile_df = pd.DataFrame(profiles)
profile_df.to_csv(r"D:\linkedin_profiles.csv")



Answer 1

Score: 1

To reduce execution time you can do two things.

## WebDriverWait

Replace

    time.sleep(...)
    driver.find_element(By.XPATH, path)

with

    WebDriverWait(driver, seconds).until(EC.visibility_of_element_located((By.XPATH, path)))

This command pauses execution until the element is found, and raises an exception if the element is not found. By replacing `time.sleep()` you save a few seconds on each iteration.
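
For reference, a minimal sketch of the imports and an explicit wait, assuming the same driver and `By` import as in the question (the 9-second timeout is just an illustrative value):

    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    # wait up to 9 seconds for the 'Next' button instead of sleeping a fixed amount of time
    page_button = WebDriverWait(driver, 9).until(
        EC.visibility_of_element_located((By.XPATH, '//button[@aria-label="Next"]'))
    )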

## JavaScript

Replace

    driver.find_element(By.XPATH, path).text

with JavaScript:

    driver.execute_script('return document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue?.innerText;', path)

which executes much faster. Moreover, it returns `None` if it doesn't find the element, so the `try`/`except` blocks can be removed.
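
To avoid repeating the long `execute_script` call for every field, it can be wrapped in a small helper; the function name `xpath_text` is just an illustrative choice:

    # hypothetical helper: return the innerText of the first XPath match, or None if nothing matches
    def xpath_text(driver, xpath):
        return driver.execute_script(
            'return document.evaluate(arguments[0], document, null, '
            'XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue?.innerText;',
            xpath)

    # usage, e.g.: name = xpath_text(driver, elements['name'])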

---

The code below is divided into two blocks. The first one scrapes the profile links, the second one scrapes the profile data. The first block took 80 seconds to scrape 40 pages and found 160 links. The second block took 82 seconds to scrape 20 profiles (I limited it to 20 by using `for profile, url in enumerate(links[:20]):`), so about 4 seconds per profile. Scraping all 160 profiles would therefore take about 11 minutes.

About the XPaths in `elements`, I changed some of them to shorten them or because they were not correct.

Finally, I removed the code for `see_mores` and `expanders`: the first because the full text is already present in the HTML, so there is no need to click "see more"; the second because clicking to expand a section loads a new page.

    url = 'https://www.linkedin.com/search/results/people/?keywords=Master+of+Business+Data+Science+Otago&origin=SWITCH_SEARCH_VERTICAL&sid=RZW'
    driver.get(url)
    
    start = time.time()
    links = []
    for page in range(1, 999):
        print(f'{page=} {len(links)=}', end='\r')
        # wait until the results are loaded
        WebDriverWait(driver, 9).until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.search-results-container')))
        no_results_found = driver.find_elements(By.XPATH, "//div[@class='search-results-container']//h2[text()='No results found']")
        if no_results_found:
            print("\nNo results found")
            break
        # scroll to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        page_button = WebDriverWait(driver, 9).until(EC.visibility_of_element_located((By.XPATH, '//button[@aria-label="Next"]')))
        if page_button.get_attribute('disabled'):
            print("\nNo more pages")
            break
        links += [a.get_attribute('href') for a in driver.find_elements(By.CSS_SELECTOR, 'div.mb1 [href*=miniProfileUrn]')]
        page_button.click()

    print(f'\nelapsed time block 1: {time.time()-start:.1f} seconds')
    
    elements = {'name'       : "//h1",
                'prefix'     : "//span[@class='text-body-small v-align-middle break-words t-black--light']",
                'title'      : "//div[@class='text-body-medium break-words']",
                'location'   : "//span[@class='text-body-small inline t-black--light break-words']",
                'about'      : "//div[@class='...']",
                'experience' : "//div[@id='experience']/following-sibling::div[2]/ul/li"}
    
    start = time.time()
    profiles = []
    for profile, url in enumerate(links):
        print(f'profile {profile+1}', end='\r')
        driver.get(url)
        # wait until the "Experience" section is loaded
        WebDriverWait(driver, 9).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'section:has(#experience)')))
        name     = driver.execute_script('return document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue?.innerText;', elements['name'])
        prefix   = driver.execute_script('return document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue?.innerText;', elements['prefix'])
        title    = driver.execute_script('return document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue?.innerText;', elements['title'])
        location = driver.execute_script('return document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue?.innerText;', elements['location'])
        about    = driver.execute_script('return document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue?.innerText;', elements['about'])
        experience = driver.execute_script("query = document.evaluate(arguments[0], document, null, XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);"+
                                           "var nodes = [];" + 
                                           "while (node = query.iterateNext()) {" +
                                           "    nodes.push(node.innerText);" + 
                                           "}"+
                                           "return nodes;", elements['experience'])
        profiles.append({'name':name,'prefix':prefix,'title':title,'location':location,'about':about,'experience':experience})

    print(f'\nelapsed time block 2: {time.time()-start:.1f} seconds')
    
    pd.DataFrame(profiles)
Output:

    page=41 len(links)=160
    No results found
    elapsed time block 1: 80.2 seconds
    profile=20
    elapsed time block 2: 81.8 seconds
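
If you also want the CSV export from the question, assign the data frame and write it out (the path is just the example from the question):

    profile_df = pd.DataFrame(profiles)
    profile_df.to_csv(r"D:\linkedin_profiles.csv", index=False)  # index=False skips the row-number column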
