# Proxycurl API doesn't return data properly

# Question



Let me first explain the workflow quickly: the user enters a search query -> a LinkedIn search is run with that query -> profile URLs are grabbed (depending on the number of pages) -> those users are looked up in Proxycurl (https://nubela.co/proxycurl/docs#people-api-person-lookup-endpoint) -> their info is grabbed with a function -> stored in my DB -> info about the scraped users' experiences is grabbed -> another Proxycurl search is made, this time for the companies -> company info is grabbed and stored in the DB -> the company's employees are searched (https://nubela.co/proxycurl/docs#company-api-employee-search-api-endpoint) -> the CTO's URL is grabbed -> the Contact API is queried for the CTO's details (https://nubela.co/proxycurl/docs#contact-api-personal-contact-number-lookup-endpoint and https://nubela.co/proxycurl/docs#contact-api-personal-email-lookup-endpoint) -> everything is stored in the database.
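
Schematically, that chain maps onto the functions in the full code below (a sketch only; DB writes and error handling omitted):

    for url in URLs_all_page:                              # profile URLs scraped from the LinkedIn search
        profile = get_profile_info(url)                    # Person Lookup endpoint
        for item in profile['experiences']:
            company_url = item['company_linkedin_profile_url']
            company = get_company_info(company_url)        # Company Profile endpoint
            cto_url = get_company_employee_url(company_url)    # Employee Search endpoint ('CTO' regex)
            cto_contact = get_company_employee_info(cto_url)   # Contact API lookup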

Ok, so I manage to grab the URLs and look the users up in the API, but I never manage to get the 'extra' information with my code, even though I can retrieve it for the same profiles in Postman; the same goes for `personal_email`, `personal_contact_number`, and `github_profile_id`.
I also manage to grab the company data, but with the same problem: I can't retrieve the 'extra' information, `funding_data`, or `acquisitions`, even though I include them in my code.

I really don't know what's wrong with my code (I'm assuming something is wrong, because everything works perfectly in Postman), and I could use a little help here. The full code is below, followed by a small request-logging check for comparing the script with Postman.

    import requests
    from datetime import datetime
    import json
    import re
    import selenium
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.common.exceptions import NoSuchElementException
    from selenium.webdriver.common.action_chains import ActionChains
    from bs4 import BeautifulSoup, NavigableString, Tag
    from time import sleep
    from time import time
    from webdriver_manager.chrome import ChromeDriverManager
    from selenium.webdriver.chrome.options import Options
    import csv
    import firebase_admin
    from firebase_admin import credentials
    from firebase_admin import db
    import openpyxl


    cred = credentials.Certificate(r"C:\Users\radia\Downloads\st-londres-2-firebase-adminsdk-7eowq-786e799875.json")
    firebase_admin.initialize_app(cred, {
        'databaseURL': 'https://st-londres-2-default-rtdb.firebaseio.com/'
    })
    
    
    print('- Importing packages')
    # Task 1: webdriver configuration
    driver = webdriver.Chrome(ChromeDriverManager().install())
    # Task 1.1: open Chrome and access LinkedIn
    sleep(2)
    url = 'https://www.linkedin.com/login'
    driver.get(url)
    print('Initializing the Chrome driver')
    sleep(2)

    # Task 1.2: import username and password
    credential = open(r"C:\Users\radia\OneDrive\Bureau\credentials.txt")
    line = credential.readlines()
    username = line[0]
    password = line[1]
    print('Importing credentials')
    sleep(2)

    # Task 1.2: key in login credentials
    email_field = driver.find_element(By.ID, 'username')
    email_field.send_keys(username)
    print('Email OK')
    sleep(3)

    password_field = driver.find_element(By.NAME, 'session_password')
    password_field.send_keys(password)
    print('Password OK')
    sleep(2)

    # Task 1.2: click the login button
    signin_field = driver.find_element(By.XPATH, '//*[@id="organic-div"]/form/div[3]/button')
    signin_field.click()
    sleep(3)

    print('- Task A: Connecting to LinkedIn')

    search_field = driver.find_element(By.XPATH, '//*[@id="global-nav-typeahead"]/input')

    search_query = input('Type of profile to scrape: ')
    search_field.send_keys(search_query)
    search_field.send_keys(Keys.RETURN)
    print('TASK B OK')
    sleep(10)

    try:
        driver.find_element(By.XPATH, "//*[@id='search-reusables__filters-bar']/ul/li[2]/button").click()
    except selenium.common.exceptions.NoSuchElementException:
        print("Element not found")
    
    
    def GetURL():  # function to grab LinkedIn profile URLs from the results page
        page_source = BeautifulSoup(driver.page_source, features='lxml')
        a_elements = page_source.find_all('a', {'class': "app-aware-link"})
        all_urls = []
        for element in a_elements:
            url = element.get('href')
            all_urls.append(url)
        return all_urls

    ## Pagination
    sleep(2)
    input_page = int(input('Number of pages to scrape: '))
    URLs_all_page = []
    for page in range(input_page):
        URLs_one_page = GetURL()
        sleep(2)
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')  # scroll to the end of the page
        sleep(3)
        next_button = driver.find_element(By.XPATH, '//button[contains(@class, "artdeco-pagination__button--next") and .//li-icon]')
        driver.execute_script("arguments[0].click();", next_button)
        sleep(2)
        if URLs_one_page is not None:
            URLs_all_page = URLs_all_page + URLs_one_page
            print(URLs_all_page)
        else:
            print('Variable stores a None value')
            sleep(2)
            print(URLs_all_page)
    sleep(1)
        
    def get_profile_info(url):  # function to make API calls for users
        api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
        api_key = 'SDrD73S2fXlvCMdFDExEaw'
        headers = {'Authorization': 'Bearer ' + api_key}
        params = {
            'url': url,
            'fallback_to_cache': 'on-error',
            'use_cache': 'if-present',
            'skills': 'include',
            'inferred_salary': 'include',
            'personal_email': 'include',
            'personal_contact_number': 'include',
            'twitter_profile_id': 'include',
            'facebook_profile_id': 'include',
            'github_profile_id': 'include',
            'extra': 'include',
        }
        try:
            response = requests.get(api_endpoint, headers=headers, params=params)
            if response.status_code != 404:
                data_profile = response.json()
                return data_profile
            else:
                return None
        except requests.exceptions.RequestException as e:
            print(e)
            return None
    
    def get_company_info(url):  # function to make API calls for companies
        api_key = 'SDrD73S2fXlvCMdFDExEaw'
        headers = {'Authorization': 'Bearer ' + api_key}
        api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company'
        params = {
            'resolve_numeric_id': 'true',
            'categories': 'include',
            'funding_data': 'include',
            'extra': 'include',
            'exit_data': 'include',
            'acquisitions': 'include',
            'url': 'include',
            'use_cache': 'if-present',
        }
        try:
            # NOTE: only {'url': url} is sent here; the params dict built above
            # (funding_data, acquisitions, extra, ...) is never passed to the request
            response = requests.get(api_endpoint, params={'url': url}, headers=headers)
            if response.status_code == 404:
                print("Company not found for URL:", url)
                return None
            else:
                data_company = response.json()
                print(data_company)
                if 'extra' in data_company:
                    print("Extra information found:", data_company['extra'])
                else:
                    print("No extra information found in JSON response.")
                return data_company

        except requests.exceptions.RequestException as e:
            print(e)
            return None
    
    def get_company_employee_url(company_linkedin_profile_url):
        api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company/employee/search/'
        api_key = 'SDrD73S2fXlvCMdFDExEaw'
        header_dic = {'Authorization': 'Bearer ' + api_key}
        params = {
            'page_size': '10',
            'linkedin_company_profile_url': company_linkedin_profile_url,
            'keyword_regex': '[Cc][Tt][Oo]',
            'enrich_profiles': 'enrich',
            'resolve_numeric_id': 'false',
        }
        response = requests.get(api_endpoint,
                                params=params,
                                headers=header_dic)
        print(response.status_code)
        print(response.text)
        if response.status_code == 404:
            print("No employees found for URL:", company_linkedin_profile_url)
            return None
        else:
            data_employees = response.json()
            if 'employees' in data_employees:
                print("Employees found:", data_employees['employee_search_results'])
            else:
                print("No employees found in JSON response.")
            # print the profile_url of each search result
            # NOTE: the URLs are only printed, never returned, so the caller
            # of this function always receives None
            for employee in data_employees['employee_search_results']:
                profile_url = employee['profile_url']
                print(profile_url)
        
    def get_company_employee_info(profile_url):
        api_endpoint = 'https://nubela.co/proxycurl/api/contact-api/personal-contact'
        api_key = 'SDrD73S2fXlvCMdFDExEaw'
        header_dic = {'Authorization': 'Bearer ' + api_key}
        params = {
            # NOTE: a hardcoded test profile is queried here; the profile_url
            # argument is ignored, and the response is never returned
            'linkedin_profile_url': 'https://linkedin.com/in/test-phone-number',
        }
        response = requests.get(api_endpoint,
                                params=params,
                                headers=header_dic)
    # Initialize the list of visited URLs

    visited_urls = []

    for url in URLs_all_page:
        if url in visited_urls:
            print("Profile already exists in the database for URL:", url)
            continue
        data = get_profile_info(url)
        if data and "error" in data:
            print(data["error"])
        if not data or "experiences" not in data:
            continue
        data["search_query"] = search_query  # add the search query to the data
        db.reference('profiles').push(data)  # store data in the candidates table

        visited_urls.append(url)
        print("Profile data and search query successfully added to the candidates table for URL:", url)

        for item in data['experiences']:
            company_name = str(item['company'])
            company_name_push = re.sub(r'[^a-zA-Z0-9]', '', company_name)  # strip characters that are illegal in db keys
            company_linkedin_profile_url = item['company_linkedin_profile_url']
            company_description = item['description']
            company_data = get_company_info(company_linkedin_profile_url)
            if company_name_push:
                filtered_company = db.reference('companies/' + company_name_push).get()
            else:
                continue

            if filtered_company is None:
                db.reference('companies').push({
                    'company_name': company_name_push,
                    'company_linkedin_profile_url': company_linkedin_profile_url,
                    'company_description': company_description,
                    'company_data': company_data
                })
                print("Company data successfully added for URL:", company_linkedin_profile_url)
            else:
                print("Company already exists in the database for URL:", company_linkedin_profile_url)

            experiences = {
                'candidate_name': data['full_name'],
                'title': item['title'],
                'company': item['company'],
                'location': item['location'],
                'start_date': item['starts_at'],
                'end_date': item['ends_at'],
                'description': item['description'],
            }
            db.reference('experiences').push(experiences)

            company_employee_url = get_company_employee_url(company_linkedin_profile_url)
            company_employee_data = get_company_employee_info(company_employee_url)

            if company_employee_data:
                db.reference('company_employees/' + company_name_push).push(company_employee_data)
                print("Company employee data successfully added for company:", company_name)
            else:
                print("No data found for company employees for company:", company_name)
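
One quick way to compare what the script sends with what Postman sends is to print the prepared request URL that `requests` actually issued, then replay that exact URL in Postman. A minimal sketch, reusing the endpoint and key from the code above (the profile URL is a placeholder):

    import requests

    response = requests.get(
        'https://nubela.co/proxycurl/api/v2/linkedin',
        headers={'Authorization': 'Bearer SDrD73S2fXlvCMdFDExEaw'},
        params={
            'url': 'https://www.linkedin.com/in/some-profile/',  # placeholder profile
            'extra': 'include',
            'personal_email': 'include',
        },
    )
    print(response.request.url)    # the fully encoded URL that was actually sent
    print(response.status_code)
    print(response.json().get('extra'))  # optional field: may be missing from the JSON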





# Answer 1
**Score**: 1

The gist is that the Proxycurl API returns the extra information on a best-effort basis. If it has no results for a field, that field is simply not returned.
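
In practice that means optional fields like `extra`, `funding_data`, or `personal_email` can be absent from the JSON entirely even when you request them, so read them with `dict.get()` instead of indexing. A minimal sketch against the question's `get_profile_info` (the profile URL is a placeholder):

    data = get_profile_info('https://www.linkedin.com/in/some-profile/')  # placeholder
    if data:
        # .get() returns None instead of raising KeyError when a field is missing
        extra = data.get('extra')
        if extra is None:
            print('Proxycurl returned no extra info for this profile')
        else:
            print('Extra info:', extra)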




Source: https://go.coder-hub.com/75448959.html (published 2023-02-14)