英文:
Adding multiple tags in one result in BeautifulSoup
问题
adress = company.find_all('span')[1].text + company.find_all('span')[2].text + company.find_all('span')[3].text + company.find_all('span')[4].text
英文:
In the line when defining adress I want to add 4 results from different <span> tags into 1. But I cannot figure out how to do so.
from bs4 import BeautifulSoup
import requests, openpyxl

# Optional Excel export setup (kept commented out, as in the original draft):
#excel = openpyxl.Workbook()
#print(excel.sheetnames)
#sheet = excel.active
#sheet.title = 'KNX manufacturers'
#print(excel.sheetnames)
#sheet.append(['name', 'description', 'country', 'website', 'phone', 'CEO', 'adress'])

try:
    source = requests.get('https://www.knx.org/knx-en/for-professionals/community/manufacturers/')
    source.raise_for_status()
    soup = BeautifulSoup(source.text, 'html.parser')

    companys = soup.find('div', class_="accordion").find_all('li')
    for company in companys:
        name = company.find('span', class_="desktop_only").text
        description = company.find('div', class_="col-md-6 col-sm-12 col-xs-12").text.strip()
        country = company.find('div', class_="col-lg-4 col-sm-6 col-xs-6 item").span.text
        website = company.find('div', class_="col-sm-6 col-xs-12").a.text
        # NOTE(review): str.strip('Phone: ') strips *characters* from both ends,
        # not the literal prefix 'Phone: ' — kept as in the question, but
        # removeprefix('Phone: ') is likely what was meant; confirm.
        phone = company.find('div', class_="col-sm-6 col-xs-12").find_all('span')[5].text.strip('Phone: ')
        CEO = company.find('div', class_="col-sm-6 col-xs-12").find_all('strong')[0].text.strip()
        # FIX: a list cannot be indexed with [1, 2, 3, 4] — that passes a tuple
        # to list.__getitem__ and raises TypeError. Slice the four <span>
        # elements instead and join their texts into one string.
        address_spans = company.find('div', class_="col-sm-6 col-xs-12").find_all('span')[1:5]
        adress = ', '.join(span.text for span in address_spans)
        print(adress)
        #print(name, description, country, website, phone, CEO, adress)
        #sheet.append([name, description, country, website, phone, CEO, adress])
        break
except Exception as e:
    print(e)

#excel.save('KNX manufacturers | Bemsiq.xlsx')
I tried adding find_all('span')[1, 2, 3, 4], but it can only show one of these results at a time.
答案1
得分: 1
要将所有数据从表格提取到pandas DataFrame中,您可以尝试:
```py
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.knx.org/knx-en/for-professionals/community/manufacturers/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

def get_text(tag):
    """Return the tag's text with runs of whitespace collapsed to one space."""
    return re.sub(r'\s{2,}', ' ', tag.text.strip())

all_data = []
for a in soup.select('.accordion-body'):
    name = a.select_one('.mobile_only').text
    # Phone/website/email sit next to labelled <strong> tags; fall back to '-'
    # when a label is absent for a company.
    phone = a.select_one('strong:-soup-contains("Phone:")')
    phone = phone.find_next_sibling(string=True) if phone else '-'
    web = a.select_one('strong:-soup-contains("Website:") + a')
    web = web['href'] if web else '-'
    email = a.select_one('strong:-soup-contains("Email:") + a')
    email = email['href'] if email else '-'
    # The first <strong> is the contact person; the three following <span>
    # siblings hold the address lines (street, postcode/city, country).
    person = a.strong
    line1 = person.find_next_sibling("span")
    line2 = line1.find_next_sibling("span")
    line3 = line2.find_next_sibling("span")
    address = f'{get_text(line1)}, {get_text(line2)}, {get_text(line3)}'
    person = person.text
    all_data.append((name, phone, web, email, person, address))

df = pd.DataFrame(all_data, columns=['Name', 'Phone', 'Web', 'Email', 'Person', 'Address'])
print(df.head(10).to_markdown(index=False))
打印结果:
Name | Phone | Web | Email | Person | Address
---|---|---|---|---|---
[mn]medianet | +49.6103697784 | https://www.medianet-home.de/ | mailto:info@medianet-home.de | Bernhard Hnida | Am Taubhaus 29, 63303 Dreieich, Germany |
1Home Solutions GmbH | +386.51300606 | http://www.1home.io | mailto:dejan.bukovnik@voxior.com | Dejan Bukovnik | Wattstrasse 11, 13355 Berlin, Germany |
3domotic Global Systems, S.L. | +34.610991993 | - | mailto:software@3atel.com | Santiago Ribas Roca | Enric Granados 153, Pral Bis - 1, 08008 Barcelona, Spain |
4ba GmbH | +49.6461980440 | http://www.4ba-gmbh.de | mailto:oh@dacom-homeautomation.de | Oliver Herrmann | Am Roten Stein 9A, 35216 Biedenkopf, Germany |
ABB France | +33.561151845 | http://www.abb.com | mailto:cathy.zotti@fr.abb.com | Serge Le Men | rue Paul Gauguin 33, 31100 Toulouse, France |
ABB LV Installation Materials Company Limited, Beijing | +86.1058085086 | http://www.abb.com.cn | mailto:yuwei.dai@cn.abb.com | Yuwei Dai | Kangding Street No. 17, 100176 Beijing, China |
ABB S.p.A. | +39.35395269 | http://www.abb.com/it | mailto:marco.simonella@it.abb.com | Marco Simonella | Via delle Industrie 18, 20010 Vittuone, Italy |
ABB Schweiz Ltd. | +41 58/5864566 | http://www.levyfils.ch | mailto:tudor.baiatu@ch.abb.com | Tudor Baiatu | Fulachstrasse 150, 8201 Schaffhausen, Switzerland |
ABB Stotz-Kontakt GmbH | +49.62217011357 | http://www.abb.com/knx | mailto:volker.biewendt@de.abb.com | Volker Biewendt | Eppelheimer Straße 82, 69123 Heidelberg, Germany |
ABB Xiamen Smart Technology Co., Ltd. | +86.5927616016 | https://new.abb.com/ | mailto:sylvia-shanshan.wang@cn.abb.com | Sylvia Wang | FangShanXiEr Road No.881, 361000 Xiamen, China |
<details>
<summary>英文:</summary>
To get all data from the table to pandas DataFrame you can try:
```py
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = 'https://www.knx.org/knx-en/for-professionals/community/manufacturers/'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')

def get_text(tag):
    """Return the tag's text with runs of whitespace collapsed to one space."""
    return re.sub(r'\s{2,}', ' ', tag.text.strip())

all_data = []
for a in soup.select('.accordion-body'):
    name = a.select_one('.mobile_only').text
    # Phone/website/email sit next to labelled <strong> tags; fall back to '-'
    # when a label is absent for a company.
    phone = a.select_one('strong:-soup-contains("Phone:")')
    phone = phone.find_next_sibling(string=True) if phone else '-'
    web = a.select_one('strong:-soup-contains("Website:") + a')
    web = web['href'] if web else '-'
    email = a.select_one('strong:-soup-contains("Email:") + a')
    email = email['href'] if email else '-'
    # The first <strong> is the contact person; the three following <span>
    # siblings hold the address lines (street, postcode/city, country).
    person = a.strong
    line1 = person.find_next_sibling("span")
    line2 = line1.find_next_sibling("span")
    line3 = line2.find_next_sibling("span")
    address = f'{get_text(line1)}, {get_text(line2)}, {get_text(line3)}'
    person = person.text
    all_data.append((name, phone, web, email, person, address))

df = pd.DataFrame(all_data, columns=['Name', 'Phone', 'Web', 'Email', 'Person', 'Address'])
print(df.head(10).to_markdown(index=False))
Prints:
Name | Phone | Web | Email | Person | Address
---|---|---|---|---|---
[mn]medianet | +49.6103697784 | https://www.medianet-home.de/ | mailto:info@medianet-home.de | Bernhard Hnida | Am Taubhaus 29, 63303 Dreieich, Germany |
1Home Solutions GmbH | +386.51300606 | http://www.1home.io | mailto:dejan.bukovnik@voxior.com | Dejan Bukovnik | Wattstrasse 11, 13355 Berlin, Germany |
3domotic Global Systems, S.L. | +34.610991993 | - | mailto:software@3atel.com | Santiago Ribas Roca | Enric Granados 153, Pral Bis - 1, 08008 Barcelona, Spain |
4ba GmbH | +49.6461980440 | http://www.4ba-gmbh.de | mailto:oh@dacom-homeautomation.de | Oliver Herrmann | Am Roten Stein 9A, 35216 Biedenkopf, Germany |
ABB France | +33.561151845 | http://www.abb.com | mailto:cathy.zotti@fr.abb.com | Serge Le Men | rue Paul Gauguin 33, 31100 Toulouse, France |
ABB LV Installation Materials Company Limited, Beijing | +86.1058085086 | http://www.abb.com.cn | mailto:yuwei.dai@cn.abb.com | Yuwei Dai | Kangding Street No. 17, 100176 Beijing, China |
ABB S.p.A. | +39.35395269 | http://www.abb.com/it | mailto:marco.simonella@it.abb.com | Marco Simonella | Via delle Industrie 18, 20010 Vittuone, Italy |
ABB Schweiz Ltd. | +41 58/5864566 | http://www.levyfils.ch | mailto:tudor.baiatu@ch.abb.com | Tudor Baiatu | Fulachstrasse 150, 8201 Schaffhausen, Switzerland |
ABB Stotz-Kontakt GmbH | +49.62217011357 | http://www.abb.com/knx | mailto:volker.biewendt@de.abb.com | Volker Biewendt | Eppelheimer Straße 82, 69123 Heidelberg, Germany |
ABB Xiamen Smart Technology Co., Ltd. | +86.5927616016 | https://new.abb.com/ | mailto:sylvia-shanshan.wang@cn.abb.com | Sylvia Wang | FangShanXiEr Road No.881, 361000 Xiamen, China |
答案2
得分: 0
错误的原因是您不能直接将索引列表传递给find_all(),如您尝试的那样。相反,您应该使用列表推导或单独的find_all()调用来提取不同标签的结果,然后将它们连接成一个单一的字符串。
以下是如何修改您的代码以实现这一点:
from bs4 import BeautifulSoup
import requests

try:
    source = requests.get('https://www.knx.org/knx-en/for-professionals/community/manufacturers/')
    source.raise_for_status()
    soup = BeautifulSoup(source.text, 'html.parser')

    companies = soup.find('div', class_='accordion').find_all('li')
    for company in companies:
        name = company.find('span', class_='desktop_only').text
        description = company.find('div', class_='col-md-6 col-sm-12 col-xs-12').text.strip()
        country = company.find('div', class_='col-lg-4 col-sm-6 col-xs-6 item').span.text
        website = company.find('div', class_='col-sm-6 col-xs-12').a.text
        phone = company.find('div', class_='col-sm-6 col-xs-12').find_all('span')[5].text.strip('Phone: ')
        CEO = company.find('div', class_='col-sm-6 col-xs-12').find_all('strong')[0].text.strip()
        # Extract the address components (span elements 1 to 4) separately
        address_parts = company.find('div', class_='col-sm-6 col-xs-12').find_all('span')[1:5]
        # Join them into a single address string
        address = ', '.join(part.text for part in address_parts)
        print(address)
        # Further processing or saving to Excel goes here
        break
except Exception as e:
    print(e)
通过这种修改,地址组件(span元素1到4)将被单独提取,然后使用', ' .join()连接在一起,创建一个包含所有地址部分的单一地址字符串。
现在您应该可以看到每家公司的完整地址打印出来,并可以根据需要进行进一步处理或保存到Excel文件。
英文:
The reason you're getting an error is that you cannot pass a list of indices directly inside find_all() as you attempted. Instead, you should use a list comprehension or separate find_all() calls to extract the results from different tags and then concatenate them into a single string.
Here's how you can modify your code to achieve that:
from bs4 import BeautifulSoup
import requests

try:
    source = requests.get('https://www.knx.org/knx-en/for-professionals/community/manufacturers/')
    source.raise_for_status()
    soup = BeautifulSoup(source.text,'html.parser')

    companys = soup.find('div', class_="accordion").find_all('li')
    for company in companys:
        name = company.find('span', class_="desktop_only").text
        description = company.find('div', class_="col-md-6 col-sm-12 col-xs-12").text.strip()
        country = company.find('div', class_="col-lg-4 col-sm-6 col-xs-6 item").span.text
        website = company.find('div', class_="col-sm-6 col-xs-12").a.text
        phone = company.find('div', class_="col-sm-6 col-xs-12").find_all('span')[5].text.strip('Phone: ')
        CEO = company.find('div', class_="col-sm-6 col-xs-12").find_all('strong')[0].text.strip()
        # Extract the address components separately
        address_parts = company.find('div', class_="col-sm-6 col-xs-12").find_all('span')[1:5]
        address = ', '.join(part.text for part in address_parts)
        print(address)
        # Do further processing or saving to Excel
        break
except Exception as e:
    print(e)
With this modification, the address components (span elements 1 to 4) are extracted separately and then joined together with a comma using ', '.join(), creating a single address string containing all the address parts.
Now you should see the complete address printed for each company, and you can proceed with further processing or saving to an Excel file as desired.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论