英文:
Proxycurl API doesnt return data properly
问题
以下是您提供的代码的翻译部分:
from telnetlib import EC
import requests
from datetime import datetime
import json
import re
from cgitb import text
import selenium
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup, NavigableString, Tag
from time import sleep
from time import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import csv
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
import openpyxl
import requests
cred = credentials.Certificate(r"C:\Users\radia\Downloads\st-londres-2-firebase-adminsdk-7eowq-786e799875.json")
firebase_admin.initialize_app(cred, {
'databaseURL': 'https://st-londres-2-default-rtdb.firebaseio.com/'
})
print('- Importing packages')
# 任务 1:webdriver 配置
driver = webdriver.Chrome(ChromeDriverManager().install())
# 任务 1.1:打开 Chrome 并访问 Linkedin
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
print('Initializing the Chrome driver')
sleep(2)
# 任务 1.2:导入用户名和密码
credential = open(r"C:\Users\radia\OneDrive\Bureau\credentials.txt")
line = credential.readlines()
username = line[0]
password = line[1]
print('Importing credentials')
sleep(2)
# 任务 1.2:输入登录凭据
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
print('Email OK')
sleep(3)
password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
print('Password OK')
sleep(2)
# 任务 1.2:点击登录按钮
signin_field = driver.find_element(By.XPATH, '//*[@id="organic-div"]/form/div[3]/button')
signin_field.click()
sleep(3)
print('- Task A: Connecting to Linkedin')
search_field = driver.find_element(By.XPATH, '//*[@id="global-nav-typeahead"]/input')
search_query = input('Type of profile to scrape ')
search_field.send_keys(search_query)
search_field.send_keys(Keys.RETURN)
print('TASK B OK')
sleep(10)
try:
driver.find_element(By.XPATH, "//*[@id='search-reusables__filters-bar']/ul/li[2]/button").click()
except selenium.common.exceptions.NoSuchElementException:
print("Element not found")
# 获取 LinkedIn URL 的函数
def GetURL():
page_source = BeautifulSoup(driver.page_source, features='lxml')
a_elements = page_source.find_all('a', {'class': "app-aware-link"})
all_urls = []
for element in a_elements:
url = element.get('href')
all_urls.append(url)
return all_urls
# 分页
sleep(2)
input_page = int(input('Number of pages to scrape: '))
URLs_all_page = []
for page in range(input_page):
URLs_one_page = GetURL()
sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);') # 滚动到页面底部
sleep(3)
next_button = driver.find_element(By.XPATH, '//button[contains(@class, "artdeco-pagination__button--next") and .//li-icon]')
driver.execute_script("arguments[0].click();", next_button)
sleep(2)
if URLs_one_page is not None:
URLs_all_page = URLs_all_page + URLs_one_page
print(URLs_all_page)
else:
print('Variable stores a None value')
sleep(2)
print(URLs_all_page)
sleep(1)
# 获取用户信息的函数
def get_profile_info(url):
api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
api_key = 'SDrD73S2fXlvCMdFDExEaw'
headers = {'Authorization': 'Bearer ' + api_key}
params = {
'url': url,
'fallback_to_cache': 'on-error',
'use_cache': 'if-present',
'skills': 'include',
'inferred_salary': 'include',
'personal_email': 'include',
'personal_contact_number': 'include',
'twitter_profile_id': 'include',
'facebook_profile_id': 'include',
'github_profile_id': 'include',
'extra': 'include',
}
try:
response = requests.get(api_endpoint, headers=headers, params=params)
if response.status_code != 404:
data_profile = response.json()
return data_profile
else:
return None
except requests.exceptions.RequestException as e:
print (e)
return None
# 获取公司信息的函数
def get_company_info(url):
api_key = 'SDrD73S2fXlvCMdFDExEaw'
headers = {'Authorization': 'Bearer ' + api_key}
api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company'
params = {
'resolve_numeric_id': 'true',
'categories': 'include',
'funding_data': 'include',
'extra': 'include',
'exit_data': 'include',
'acquisitions': 'include',
'url': 'include',
'use_cache': 'if-present',
}
try:
response = requests.get(api_endpoint, params={'url': url}, headers=headers)
if response.status_code == 404:
print("Company not found for URL:", url)
return None
else:
data_company = response.json()
print(data_company)
if 'extra' in data_company:
print("Extra information found:", data_company['extra'])
else:
print("No extra information found in JSON response.")
return data_company
except requests.exceptions.RequestException as e:
print (e)
return None
# 获取公司员工 URL 的函数
def get_company_employee_url(company_linkedin_profile_url):
api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company/employee/search/'
api_key = 'SDrD73S2fXlvCMdFDExEaw'
header_dic = {'Authorization': 'Bearer ' + api_key}
params = {
'page_size': '10',
'linkedin_company_profile_url': company_linkedin_profile_url,
'keyword_regex': '[CCTTOO]',
'enrich_profiles': 'enrich',
'resolve_numeric_id': 'false',
}
response = requests.get(api_endpoint,
params=params,
headers=header_dic)
print(response.status_code)
print(response.text)
if response.status_code == 404:
print("No employees found for URL:", url)
return None
else:
data_employees = response.json()
if 'employees
<details>
<summary>英文:</summary>
Let me first explain quickly the workflow, User enters a search query -> making a search in linkedin with this query -> grabbing urls of users (in function of nb of pages) -> search for these users in proxycurl (https://nubela.co/proxycurl/docs#people-api-person-lookup-endpoint) -> grab their infos with a function -> store them in my db -> grabs infos about the experiences of the scraped users -> make a search in proxycurl API again but for the companies this time -> grab infos about companies and store them in db -> search infos about employees in this company (https://nubela.co/proxycurl/docs#company-api-employee-search-api-endpoint) -> grab url of the CTO -> search in the contact API to grab the infos about the CTO (https://nubela.co/proxycurl/docs#contact-api-personal-contact-number-lookup-endpoint and https://nubela.co/proxycurl/docs#contact-api-personal-email-lookup-endpoint) -> store everything in database.
Ok so I manage to grab URLs, search for the users in API, but I never manage to get the 'extra' information with my code while I can grab them for the same profiles in Postman, same for `personnal_email`, `personnal_contact_number`, `github_profile_id`.
Then I manage to grab the data about the companies, but still same problem, can't retrieve the 'extra' information, or the `funding_data` or `acquisitions` even if I include them in my code.
I really don't know what's wrong with my code (I'm assuming something's wrong because everything works perfectly with Postman), and I can take a little help here (Full code below).
from telnetlib import EC
import requests
from datetime import datetime
import json
import re
from cgitb import text
import selenium
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup, NavigableString, Tag
from time import sleep
from time import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import csv
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
import openpyxl
import requests
cred = credentials.Certificate(r"C:\Users\radia\Downloads\st-londres-2-firebase-adminsdk-7eowq-786e799875.json")
firebase_admin.initialize_app(cred, {
'databaseURL': 'https://st-londres-2-default-rtdb.firebaseio.com/'
})
print('- Importation des packages')
# Task 1: webdriver configuration
driver = webdriver.Chrome(ChromeDriverManager().install())
# Task 1.1: Open Chrome and Access Linkedin
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
print('Initialisation du chrome driver')
sleep(2)
# Task 1.2: Import username and password
credential = open(r"C:\Users\radia\OneDrive\Bureau\credentials.txt")
line = credential.readlines()
username = line[0]
password = line[1]
print('Importation des id')
sleep(2)
# Task 1.2: Key in login credentials
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
print('Email ok')
sleep(3)
password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
print('Mdp ok')
sleep(2)
# Task 1.2: Click the Login button
signin_field = driver.find_element(By.XPATH, '//*[@id="organic-div"]/form/div[3]/button')
signin_field.click()
sleep(3)
print('- Task A: Connexion à Linkedin')
search_field = driver.find_element(By.XPATH, '//*[@id="global-nav-typeahead"]/input')
search_query = input('Type of profile to scrape ')
search_field.send_keys(search_query)
search_field.send_keys(Keys.RETURN)
print('TASK B OK')
sleep(10)
try:
driver.find_element(By.XPATH, "//*[@id='search-reusables__filters-bar']/ul/li[2]/button").click()
except selenium.common.exceptions.NoSuchElementException:
print("Element not found")
def GetURL(): #function to grab linkedin urls
page_source = BeautifulSoup(driver.page_source, features='lxml')
a_elements = page_source.find_all('a', {'class': "app-aware-link"})
all_urls = []
for element in a_elements:
url = element.get('href')
all_urls.append(url)
return all_urls
##Pagination
sleep(2)
input_page = int(input('Nombre de pages à scraper: '))
URLs_all_page = []
for page in range(input_page):
URLs_one_page = GetURL()
sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);') #scrolling to the end of the page
sleep(3)
next_button = driver.find_element(By.XPATH, '//button[contains(@class, "artdeco-pagination__button--next") and .//li-icon]')
driver.execute_script("arguments[0].click();", next_button)
sleep(2)
if URLs_one_page is not None:
URLs_all_page = URLs_all_page + URLs_one_page
print(URLs_all_page)
else:
print('variable stores a None value')
sleep(2)
print(URLs_all_page)
sleep(1)
def get_profile_info(url): # function to make api calls for users
api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
api_key = 'SDrD73S2fXlvCMdFDExEaw'
headers = {'Authorization': 'Bearer ' + api_key}
params = {
'url': url,
'fallback_to_cache': 'on-error',
'use_cache': 'if-present',
'skills': 'include',
'inferred_salary': 'include',
'personal_email': 'include',
'personal_contact_number': 'include',
'twitter_profile_id': 'include',
'facebook_profile_id': 'include',
'github_profile_id': 'include',
'extra': 'include',
}
try:
response = requests.get(api_endpoint, headers=headers, params=params)
if response.status_code != 404:
data_profile = response.json()
return data_profile
else:
return None
except requests.exceptions.RequestException as e:
print (e)
return None
def get_company_info(url): #function to make api calls for companies
api_key = 'SDrD73S2fXlvCMdFDExEaw'
headers = {'Authorization': 'Bearer ' + api_key}
api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company'
params = {
'resolve_numeric_id': 'true',
'categories': 'include',
'funding_data': 'include',
'extra': 'include',
'exit_data': 'include',
'acquisitions': 'include',
'url': 'include',
'use_cache': 'if-present',
}
try:
response = requests.get(api_endpoint, params={'url':url}, headers=headers)
if response.status_code == 404:
print("Company not found for URL:", url)
return None
else:
data_company = response.json()
print(data_company)
if 'extra' in data_company:
print("Extra information found:", data_company['extra'])
else:
print("No extra information found in JSON response.")
return data_company
except requests.exceptions.RequestException as e:
print (e)
return None
def get_company_employee_url(company_linkedin_profile_url):
api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company/employee/search/'
api_key = 'SDrD73S2fXlvCMdFDExEaw'
header_dic = {'Authorization': 'Bearer ' + api_key}
params = {
'page_size': '10',
'linkedin_company_profile_url': company_linkedin_profile_url,
'keyword_regex': '[Cc][Tt][Oo]',
'enrich_profiles': 'enrich',
'resolve_numeric_id': 'false',
}
response = requests.get(api_endpoint,
params=params,
headers=header_dic)
print(response.status_code)
print(response.text)
if response.status_code == 404:
print("No employees found for URL:", url)
return None
else:
data_employees = response.json()
if 'employees' in data_employees:
print("Employees found:", data_employees['employee_search_results'])
else:
print("No employees found in JSON response.")
#return and store profile_url in data_employees:
for employee in data_employees['employee_search_results']:
profile_url = employee['profile_url']
print(profile_url)
def get_company_employee_info(profile_url):
api_endpoint = 'https://nubela.co/proxycurl/api/contact-api/personal-contact'
api_key = 'SDrD73S2fXlvCMdFDExEaw'
header_dic = {'Authorization': 'Bearer ' + api_key}
params = {
'linkedin_profile_url': 'https://linkedin.com/in/test-phone-number',
}
response = requests.get(api_endpoint,
params=params,
headers=header_dic)
# Initialize visited URLs + data_list
visited_urls = []
for url in URLs_all_page:
if url in visited_urls:
print("Profile already exists in the database for URL:", url)
continue
data = get_profile_info(url)
if data and "error" in data:
print(data["error"])
if not data or "experiences" not in data:
continue
data["search_query"] = search_query # Add the search_query to the data
db.reference('profiles').push(data) # Store data in the candidates table
visited_urls.append(url)
print("Profile data and search query successfully added to the candidates table for URL:", url)
for item in data['experiences']:
company_name = str(item['company'])
company_name_push = re.sub(r'[^a-zA-Z0-9]', '', company_name) # Error handling when pushing code to db, replacement of illegal values
company_linkedin_profile_url = item['company_linkedin_profile_url']
company_description = item['description']
company_data = get_company_info(company_linkedin_profile_url)
if company_name_push:
filtered_company = db.reference('companies/'+ company_name_push).get()
else:
continue
if filtered_company is None:
db.reference('companies').push({
'company_name': company_name_push,
'company_linkedin_profile_url': company_linkedin_profile_url,
'company_description': company_description,
'company_data': company_data
})
print("Company data successfully added for URL:", company_linkedin_profile_url)
else:
print("Company already exists in the database for URL:", company_linkedin_profile_url)
experiences = {
'candidate_name': data['full_name'],
'title': item['title'],
'company': item['company'],
'location': item['location'],
'start_date': item['starts_at'],
'end_date': item['ends_at'],
'description': item['description'],
}
db.reference('experiences').push(experiences)
company_employee_url = get_company_employee_url(company_linkedin_profile_url)
company_employee_data = get_company_employee_info(company_employee_url)
if company_employee_data:
db.reference('company_employees/' + company_name_push).push(company_employee_data)
print("Company employee data successfully added for company:", company_name)
else:
print("No data found for company employees for company:", company_name)
</details>
# 答案1
**得分**: 1
The gist is that Proxycurl API returns extra information on a best effort basis. If it does not have results, it will not be returned.
<details>
<summary>英文:</summary>
The gist is that Proxycurl API returns extra information on a best effort basis. If it does not have results, it will not be returned.
</details>
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论