# Question: I get the same output in the for loop
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pandas as pd

s = Service("C:\selenium driver\chromedriver.exe")
driver = webdriver.Chrome(service=s)

companies_names = []
persons_names = []
phones_numbers = []
locations = []
opening_hours = []
descriptions = []
websites_links = []
all_profiles = []

driver.get("https://www.saveface.co.uk/search/")
driver.implicitly_wait(10)
blocks = driver.find_elements(By.XPATH, "//div[@class='result clientresult']")

for block in range(30):
    company_name = blocks[block].find_element(By.XPATH, "//h3[@class='resulttitle']").text.strip()
    companies_names.append(company_name)
    person_name = blocks[block].find_element(By.XPATH, "//p[@class='name_wrapper']").text.strip()
    persons_names.append(person_name)
    phone_number = blocks[block].find_element(By.XPATH, "//div[@class='searchContact phone']").text.strip()
    phones_numbers.append(phone_number)
    location = blocks[block].find_element(By.XPATH, "//li[@class='cls_loc']").text.strip()
    locations.append(location)
    opening_hour = blocks[block].find_element(By.XPATH, "//li[@class='opening-hours']").text.strip()
    opening_hours.append(opening_hour)
    profile = blocks[block].find_element(By.XPATH, "//a[@class='visitpage']").get_attribute("href")
    all_profiles.append(profile)
    print(company_name, person_name, phone_number, location, opening_hour, profile)
    if block == 29:
        two_page = driver.find_element(By.XPATH, "//a[@class='facetwp-page']")
        two_page.click()
        driver.implicitly_wait(10)
        blocks = driver.find_elements(By.XPATH, "//div[@class='result clientresult']")

for i in range(len(all_profiles)):
    driver.get(all_profiles[i])
    description = driver.find_element(By.XPATH, "//div[@class='desc-text-left']").text.strip()
    descriptions.append(description)
    website_link = driver.find_element(By.XPATH, "//a[@class='visitwebsite website']").get_attribute("href")
    websites_links.append(website_link)
    driver.implicitly_wait(10)

driver.close()

df = pd.DataFrame(
    {
        "company_name": companies_names,
        "person_name": persons_names,
        "phone_number": phones_numbers,
        "location": locations,
        "opening_hour": opening_hours,
        "description": descriptions,
        "website_link": websites_links,
        "profile_on_saveface": all_profiles
    }
)
df.to_csv('saveface.csv', index=False)
# print(df)
```
This is the result:
```
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
The Hartley Clinic Clinic Contact: Ailing Jeavons 01256 856289 , , Fleet, RG27 8NZ Monday 8:30 — 17:00 Tuesday 8:30 — 19:00 Wednesday 8:30— 17:00 Thursday 8:30 — 17:00 Friday 8:30 — 15:00 Saturday 9:00 — 17:00 Sunday Closed https://www.saveface.co.uk/clinic/the-hartley-clinic/
```
# Answer 1 (score: 1)
To restrict the search to the subtree rooted at the context node, your expression should start with `.//`, so you have to replace `//` with `.//` in each of the `... = blocks[block].find_element(...)` commands.

`//` means "search from the document's root", ignoring the context node `blocks[block]` altogether, which is why every iteration returns the same first match.
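For example, a minimal sketch of the relative lookup, reusing the locators from the question (it assumes a Selenium version recent enough to locate chromedriver on its own):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.saveface.co.uk/search/")
driver.implicitly_wait(10)

# Each result card on the search page
blocks = driver.find_elements(By.XPATH, "//div[@class='result clientresult']")

for block in blocks:
    # ".//h3[...]" searches only inside the current card;
    # "//h3[...]" would search the whole page and always return the first match.
    title = block.find_element(By.XPATH, ".//h3[@class='resulttitle']").text.strip()
    print(title)

driver.quit()
```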
Moreover, notice that not all the blocks have a location, as you can see from this image:

[![enter image description here][1]][1]

In this case,

```python
location = blocks[block].find_element(By.XPATH, "//li[@class='cls_loc']")
```

will raise a `NoSuchElementException`. To avoid this, you have to put the command in a `try...except` block.
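A minimal sketch of that guard, continuing the question's loop variables (`blocks`, `block`, `locations`); the fallback string is only illustrative:

```python
from selenium.common.exceptions import NoSuchElementException

try:
    location = blocks[block].find_element(By.XPATH, ".//li[@class='cls_loc']").text.strip()
except NoSuchElementException:
    location = "*missing value*"  # illustrative placeholder for blocks without a location
locations.append(location)
```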
# UPDATE
Scraping 400 blocks with Selenium takes about 1 minute on my computer; I tried BeautifulSoup and it takes less than 1 second! The slow part is scraping the profiles, because for each of them we have to download a new web page, but even that is still much faster with BeautifulSoup.

So I wrote a script that uses only BeautifulSoup instead of Selenium (you can install it by running `pip install beautifulsoup4` in the terminal):
```python
import requests
from bs4 import BeautifulSoup

url = 'https://www.saveface.co.uk/search/'
soup = BeautifulSoup(requests.get(url).text, "html.parser")

css_selector = {
    'company name' : ".title",
    'person name'  : ".name_wrapper",
    'phone number' : ".phone",
    'location'     : ".cls_loc",
    'opening hours': ".opening-hours",
    'profile link' : ".visitpage",
}
data = {key: [] for key in list(css_selector) + ['description', 'website link']}
number_of_pages = int(str(soup).split('total_pages":')[1].split('}')[0])

for page in range(2, number_of_pages + 2):
    blocks = soup.select('.clientresult')
    for idx, block in enumerate(blocks):
        print(f'blocks {idx+1}/{len(blocks)}', end='\r')
        for key in list(css_selector):
            try:
                if 'link' in key:
                    data[key] += [block.select_one(css_selector[key])['href']]
                else:
                    data[key] += [block.select_one(css_selector[key]).text.strip().replace('\r\n', ', ')]
            except AttributeError:
                data[key] += ['*missing value*']
    if page <= number_of_pages:
        print('\nloading page', page)
        url_page = f'{url}?fwp_paged={page}'
        soup = BeautifulSoup(requests.get(url_page).text, "html.parser")

print('\nno more pages to load, moving to scrape profile links...')
for idx, url in enumerate(data['profile link']):
    print(f"profile link {idx+1}/{len(data['profile link'])} ", end='\r')
    soup_profile = BeautifulSoup(requests.get(url).text, "html.parser")
    try:
        data['description'] += [soup_profile.select_one('.clinicContent > .description').text.strip()]
    except AttributeError:
        data['description'] += ['*missing value*']
    try:
        data['website link'] += [soup_profile.select_one('.visitwebsite')['href']]
    except AttributeError:
        data['website link'] += ['*missing value*']
```
Output (it took about 8 minutes to complete the execution):

```
blocks 400/400
loading page 2
blocks 109/109
no more pages to load, moving to scrape profile links...
profile link 509/509
```
Then you can easily create the dataframe by running `pd.DataFrame(data)`:
[![enter image description here][2]][2]
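For completeness, a minimal sketch of that last step (the CSV filename simply mirrors the one used in the question's script):

```python
import pandas as pd

# `data` is the dict of lists built by the BeautifulSoup script above
df = pd.DataFrame(data)
df.to_csv('saveface.csv', index=False)
print(df.head())
```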
[1]: https://i.stack.imgur.com/RLjpLm.png
[2]: https://i.stack.imgur.com/vsGn0.png
# Answer 2 (score: 0)
This is the new code, but it returns the same output on every page. Why?
```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pandas as pd

s = Service("C:\selenium driver\chromedriver.exe")
driver = webdriver.Chrome(service=s)

companies_names = []
persons_names = []
phones_numbers = []
locations = []
opening_hours = []
descriptions = []
websites_links = []
all_profiles = []

driver.get("https://www.saveface.co.uk/search/")
driver.implicitly_wait(10)

pages = driver.find_elements(By.XPATH, ".//a[@class='facetwp-page']")
for page in range(len(pages) + 1):
    blocks = driver.find_elements(By.XPATH, ".//div[@class='result clientresult']")
    for block in range(10):
        try:
            company_name = blocks[block].find_element(By.XPATH, ".//h3[@class='resulttitle']").text.strip()
            companies_names.append(company_name)
        except:
            companies_names.append("Not found on the site")
        try:
            person_name = blocks[block].find_element(By.XPATH, ".//p[@class='name_wrapper']").text.strip()
            persons_names.append(person_name)
        except:
            persons_names.append("Not found on the site")
        try:
            phone_number = blocks[block].find_element(By.XPATH, ".//div[@class='searchContact phone']").text.strip()
            phones_numbers.append(phone_number)
        except:
            phones_numbers.append("Not found on the site")
        try:
            location = blocks[block].find_element(By.XPATH, ".//li[@class='cls_loc']").text.strip()
            locations.append(location)
        except:
            locations.append("Not found on the site")
        try:
            opening_hour = blocks[block].find_element(By.XPATH, ".//li[@class='opening-hours']").text.strip()
            opening_hours.append(opening_hour)
        except:
            opening_hours.append("Not found on the site")
        try:
            profile = blocks[block].find_element(By.XPATH, ".//a[@class='visitpage']").get_attribute("href")
            all_profiles.append(profile)
        except:
            all_profiles.append("Not found on the site")
    two_page = driver.find_element(By.XPATH, ".//a[@class='facetwp-page']")
    two_page.click()

for i in range(len(all_profiles)):
    try:
        driver.get(all_profiles[i])
        driver.implicitly_wait(10)
        try:
            description = driver.find_element(By.XPATH, ".//div[@class='desc-text-left']").text.strip()
            descriptions.append(description)
        except:
            descriptions.append("Not found on the site")
        try:
            website_link = driver.find_element(By.XPATH, ".//a[@class='visitwebsite website']").get_attribute("href")
            websites_links.append(website_link)
        except:
            websites_links.append("Not found on the site")
    except:
        descriptions.append("Not found on the site")
        websites_links.append("Not found on the site")

driver.implicitly_wait(10)
driver.close()

df = pd.DataFrame(
    {
        "company_name": companies_names,
        "person_name": persons_names,
        "phone_number": phones_numbers,
        "location": locations,
        "opening_hour": opening_hours,
        "description": descriptions,
        "website_link": websites_links,
        "profile_on_saveface": all_profiles
    }
)
df.to_csv('saveface.csv', index=False)
print(df)
```