2023年7月27日 19:14:08go评论187阅读模式

英文:

Adding multiple tags in one result in BeautifulSoup

问题

adress = company.find_all('span')[1].text + company.find_all('span')[2].text + company.find_all('span')[3].text + company.find_all('span')[4].text

英文:

In the line that defines `adress`, I want to combine the text of 4 different `<span>` tags into a single string, but I cannot figure out how to do so.

```python
from bs4 import BeautifulSoup
import requests, openpyxl
#excel =  openpyxl.Workbook()
#print(excel.sheetnames)
#sheet = excel.active
#sheet.title = 'KNX manufacturers'
#print(excel.sheetnames)
#sheet.append(['name', 'description', 'country', 'website', 'phone', 'CEO', 'adress'])
try:
	source = requests.get('https://www.knx.org/knx-en/for-professionals/community/manufacturers/')
	source.raise_for_status()
	soup = BeautifulSoup(source.text,'html.parser')
	companys = soup.find('div', class_="accordion").find_all('li')
	
	for company in companys:
		name = company.find('span', class_="desktop_only").text
		description = company.find('div', class_="col-md-6 col-sm-12 col-xs-12").text.strip()
		country = company.find('div', class_="col-lg-4 col-sm-6 col-xs-6 item").span.text
		website = company.find('div', class_="col-sm-6 col-xs-12").a.text
		phone = company.find('div', class_="col-sm-6 col-xs-12").find_all('span')[5].text.strip('Phone: ')
		CEO = company.find('div', class_="col-sm-6 col-xs-12").find_all('strong')[0].text.strip()
		adress = company.find('div', class_="col-sm-6 col-xs-12").find_all('span')[1, 2, 3, 4].text
		print(adress)
		#print(name, description, country, website, phone, CEO, adress)
		#sheet.append([name, description, country, website, phone, CEO, adress])
		break
except Exception as e:
	print(e)
#excel.save('KNX manufacturers | Bemsiq.xlsx')
```

I tried `find_all('span')[1, 2, 3, 4]`, but that does not work; I can only get one of these results at a time.

答案1

得分: 1

要将所有数据从表格提取到pandas DataFrame中，您可以尝试：
```py
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'https://www.knx.org/knx-en/for-professionals/community/manufacturers/'
# Fetch the manufacturers page. The timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() stops it from parsing an HTTP
# error page as if it were the real listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
def get_text(tag):
    """Return ``tag.text`` trimmed, with whitespace runs squeezed.

    Leading and trailing whitespace is removed, then every run of two or
    more whitespace characters is replaced by a single space. A lone
    whitespace character inside the text is left untouched.
    """
    trimmed = tag.text.strip()
    squeezed = re.sub(r'\s{2,}', ' ', trimmed)
    return squeezed
# Build one (name, phone, web, email, person, address) row per
# '.accordion-body' panel; '-' marks a field that is missing from the panel.
all_data = []
for a in soup.select('.accordion-body'):
    name_tag = a.select_one('.mobile_only')
    name = name_tag.text if name_tag else '-'
    # The phone number is the text node that follows the "Phone:" label.
    phone_label = a.select_one('strong:-soup-contains("Phone:")')
    phone = phone_label.find_next_sibling(string=True) if phone_label else None
    phone = phone or '-'
    # Website and e-mail are the <a> elements directly after their labels.
    web = a.select_one('strong:-soup-contains("Website:") + a')
    web = web.get('href', '-') if web else '-'
    email = a.select_one('strong:-soup-contains("Email:") + a')
    email = email.get('href', '-') if email else '-'
    # The panel's first <strong> is stored in the 'Person' column.
    person = a.strong
    # The address is read from the first three <span> siblings that follow
    # that <strong>. Taking however many exist avoids the AttributeError a
    # chain of find_next_sibling() calls raises when a panel has fewer spans.
    address_lines = person.find_next_siblings('span', limit=3) if person else []
    address = ', '.join(get_text(line) for line in address_lines) or '-'
    person = person.text if person else '-'
    all_data.append((name, phone, web, email, person, address))
df = pd.DataFrame(all_data, columns=['Name', 'Phone', 'Web', 'Email', 'Person', 'Address'])
print(df.head(10).to_markdown(index=False))

```

打印结果：

Name	Phone	Web	Email	Person	Address
[mn]medianet	+49.6103697784	https://www.medianet-home.de/	mailto:info@medianet-home.de	Bernhard Hnida	Am Taubhaus 29, 63303 Dreieich, Germany
1Home Solutions GmbH	+386.51300606	http://www.1home.io	mailto:dejan.bukovnik@voxior.com	Dejan Bukovnik	Wattstrasse 11, 13355 Berlin, Germany
3domotic Global Systems, S.L.	+34.610991993	-	mailto:software@3atel.com	Santiago Ribas Roca	Enric Granados 153, Pral Bis - 1, 08008 Barcelona, Spain
4ba GmbH	+49.6461980440	http://www.4ba-gmbh.de	mailto:oh@dacom-homeautomation.de	Oliver Herrmann	Am Roten Stein 9A, 35216 Biedenkopf, Germany
ABB France	+33.561151845	http://www.abb.com	mailto:cathy.zotti@fr.abb.com	Serge Le Men	rue Paul Gauguin 33, 31100 Toulouse, France
ABB LV Installation Materials Company Limited, Beijing	+86.1058085086	http://www.abb.com.cn	mailto:yuwei.dai@cn.abb.com	Yuwei Dai	Kangding Street No. 17, 100176 Beijing, China
ABB S.p.A.	+39.35395269	http://www.abb.com/it	mailto:marco.simonella@it.abb.com	Marco Simonella	Via delle Industrie 18, 20010 Vittuone, Italy
ABB Schweiz Ltd.	+41 58/5864566	http://www.levyfils.ch	mailto:tudor.baiatu@ch.abb.com	Tudor Baiatu	Fulachstrasse 150, 8201 Schaffhausen, Switzerland
ABB Stotz-Kontakt GmbH	+49.62217011357	http://www.abb.com/knx	mailto:volker.biewendt@de.abb.com	Volker Biewendt	Eppelheimer Straße 82, 69123 Heidelberg, Germany
ABB Xiamen Smart Technology Co., Ltd.	+86.5927616016	https://new.abb.com/	mailto:sylvia-shanshan.wang@cn.abb.com	Sylvia Wang	FangShanXiEr Road No.881, 361000 Xiamen, China


<details>
<summary>英文:</summary>
To get all data from the table to pandas DataFrame you can try:
```py
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'https://www.knx.org/knx-en/for-professionals/community/manufacturers/'
# Fetch the manufacturers page. The timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() stops it from parsing an HTTP
# error page as if it were the real listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
def get_text(tag):
    """Return ``tag.text`` trimmed, with whitespace runs squeezed.

    Leading and trailing whitespace is removed, then every run of two or
    more whitespace characters is replaced by a single space. A lone
    whitespace character inside the text is left untouched.
    """
    return re.sub(r'\s{2,}', ' ', tag.text.strip())
# Build one (name, phone, web, email, person, address) row per
# '.accordion-body' panel; '-' marks a field that is missing from the panel.
all_data = []
for a in soup.select('.accordion-body'):
    name_tag = a.select_one('.mobile_only')
    name = name_tag.text if name_tag else '-'
    # The phone number is the text node that follows the "Phone:" label.
    phone_label = a.select_one('strong:-soup-contains("Phone:")')
    phone = phone_label.find_next_sibling(string=True) if phone_label else None
    phone = phone or '-'
    # Website and e-mail are the <a> elements directly after their labels.
    web = a.select_one('strong:-soup-contains("Website:") + a')
    web = web.get('href', '-') if web else '-'
    email = a.select_one('strong:-soup-contains("Email:") + a')
    email = email.get('href', '-') if email else '-'
    # The panel's first <strong> is stored in the 'Person' column.
    person = a.strong
    # The address is read from the first three <span> siblings that follow
    # that <strong>. Taking however many exist avoids the AttributeError a
    # chain of find_next_sibling() calls raises when a panel has fewer spans.
    address_lines = person.find_next_siblings('span', limit=3) if person else []
    address = ', '.join(get_text(line) for line in address_lines) or '-'
    person = person.text if person else '-'
    all_data.append((name, phone, web, email, person, address))
df = pd.DataFrame(all_data, columns=['Name', 'Phone', 'Web', 'Email', 'Person', 'Address'])
print(df.head(10).to_markdown(index=False))

```

Prints:

Name	Phone	Web	Email	Person	Address
[mn]medianet	+49.6103697784	https://www.medianet-home.de/	mailto:info@medianet-home.de	Bernhard Hnida	Am Taubhaus 29, 63303 Dreieich, Germany
1Home Solutions GmbH	+386.51300606	http://www.1home.io	mailto:dejan.bukovnik@voxior.com	Dejan Bukovnik	Wattstrasse 11, 13355 Berlin, Germany
3domotic Global Systems, S.L.	+34.610991993	-	mailto:software@3atel.com	Santiago Ribas Roca	Enric Granados 153, Pral Bis - 1, 08008 Barcelona, Spain
4ba GmbH	+49.6461980440	http://www.4ba-gmbh.de	mailto:oh@dacom-homeautomation.de	Oliver Herrmann	Am Roten Stein 9A, 35216 Biedenkopf, Germany
ABB France	+33.561151845	http://www.abb.com	mailto:cathy.zotti@fr.abb.com	Serge Le Men	rue Paul Gauguin 33, 31100 Toulouse, France
ABB LV Installation Materials Company Limited, Beijing	+86.1058085086	http://www.abb.com.cn	mailto:yuwei.dai@cn.abb.com	Yuwei Dai	Kangding Street No. 17, 100176 Beijing, China
ABB S.p.A.	+39.35395269	http://www.abb.com/it	mailto:marco.simonella@it.abb.com	Marco Simonella	Via delle Industrie 18, 20010 Vittuone, Italy
ABB Schweiz Ltd.	+41 58/5864566	http://www.levyfils.ch	mailto:tudor.baiatu@ch.abb.com	Tudor Baiatu	Fulachstrasse 150, 8201 Schaffhausen, Switzerland
ABB Stotz-Kontakt GmbH	+49.62217011357	http://www.abb.com/knx	mailto:volker.biewendt@de.abb.com	Volker Biewendt	Eppelheimer Straße 82, 69123 Heidelberg, Germany
ABB Xiamen Smart Technology Co., Ltd.	+86.5927616016	https://new.abb.com/	mailto:sylvia-shanshan.wang@cn.abb.com	Sylvia Wang	FangShanXiEr Road No.881, 361000 Xiamen, China

答案2

得分: 0

出错的原因是：find_all() 返回的是一个列表，而列表不能像您尝试的那样同时用多个索引取值（[1, 2, 3, 4] 会被当作一个元组索引，从而引发 TypeError）。您应该改用切片（例如 [1:5]）取出这几个标签，然后把它们的文本连接成一个单一的字符串。

以下是如何修改您的代码以实现这一点：

from bs4 import BeautifulSoup
import requests
# Scrape the KNX manufacturers listing and print the address of the first
# company; any failure (network, HTTP status, missing element) is printed.
try:
    # The timeout keeps the script from hanging on a stalled connection.
    source = requests.get('https://www.knx.org/knx-en/for-professionals/community/manufacturers/', timeout=30)
    source.raise_for_status()
    soup = BeautifulSoup(source.text, 'html.parser')
    companies = soup.find('div', class_='accordion').find_all('li')
    for company in companies:
        name = company.find('span', class_='desktop_only').text
        description = company.find('div', class_='col-md-6 col-sm-12 col-xs-12').text.strip()
        country = company.find('div', class_='col-lg-4 col-sm-6 col-xs-6 item').span.text
        # Locate the contact column once instead of searching the <li> again
        # for every field.
        contact = company.find('div', class_='col-sm-6 col-xs-12')
        spans = contact.find_all('span')
        website = contact.a.text
        # str.strip('Phone: ') removes any of those *characters* from both
        # ends rather than the literal prefix, so drop the label explicitly.
        phone = spans[5].text.replace('Phone:', '', 1).strip()
        CEO = contact.find_all('strong')[0].text.strip()
        # Spans 1-4 are the address pieces (per the question); a slice selects
        # all four at once. Collapse whitespace inside each piece and skip
        # empty ones so the joined address has no stray gaps.
        address_parts = spans[1:5]
        cleaned = (' '.join(part.text.split()) for part in address_parts)
        address = ', '.join(piece for piece in cleaned if piece)
        print(address)
        # Do further processing or save to Excel here.
        break  # stop after the first company
except Exception as e:
    print(e)

通过这种修改，地址组件（span元素1到4）将被单独提取，然后使用', ' .join()连接在一起，创建一个包含所有地址部分的单一地址字符串。

现在您应该可以看到每家公司的完整地址打印出来，并可以根据需要进行进一步处理或保存到Excel文件。

英文:

The reason you're getting an error is that find_all() returns a list, and a list cannot be indexed with several indices at once as you attempted: [1, 2, 3, 4] is interpreted as a single tuple index, which raises a TypeError. Instead, use a slice (e.g. [1:5]) to pick out those tags and then join their text into a single string.

Here's how you can modify your code to achieve that:

from bs4 import BeautifulSoup
import requests
# Scrape the KNX manufacturers listing and print the address of the first
# company; any failure (network, HTTP status, missing element) is printed.
try:
    # The timeout keeps the script from hanging on a stalled connection.
    source = requests.get('https://www.knx.org/knx-en/for-professionals/community/manufacturers/', timeout=30)
    source.raise_for_status()
    soup = BeautifulSoup(source.text, 'html.parser')
    companies = soup.find('div', class_='accordion').find_all('li')
    for company in companies:
        name = company.find('span', class_='desktop_only').text
        description = company.find('div', class_='col-md-6 col-sm-12 col-xs-12').text.strip()
        country = company.find('div', class_='col-lg-4 col-sm-6 col-xs-6 item').span.text
        # Locate the contact column once instead of searching the <li> again
        # for every field.
        contact = company.find('div', class_='col-sm-6 col-xs-12')
        spans = contact.find_all('span')
        website = contact.a.text
        # str.strip('Phone: ') removes any of those *characters* from both
        # ends rather than the literal prefix, so drop the label explicitly.
        phone = spans[5].text.replace('Phone:', '', 1).strip()
        CEO = contact.find_all('strong')[0].text.strip()
        # Spans 1-4 are the address pieces (per the question); a slice selects
        # all four at once. Collapse whitespace inside each piece and skip
        # empty ones so the joined address has no stray gaps.
        address_parts = spans[1:5]
        cleaned = (' '.join(part.text.split()) for part in address_parts)
        address = ', '.join(piece for piece in cleaned if piece)
        print(address)
        # Do further processing or save to Excel here.
        break  # stop after the first company
except Exception as e:
    print(e)

With this modification, the address components (span elements 1 to 4) are extracted separately and then joined together with a comma using ', '.join(), creating a single address string containing all the address parts.

Now you should see the complete address printed for each company, and you can proceed with further processing or saving to an Excel file as desired.

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

在BeautifulSoup中一次添加多个标签

问题

In the line that defines `adress`, I want to combine the text of 4 different `<span>` tags into a single string, but I cannot figure out how to do so.

答案1

答案2

移除从Python爬取的提取文本中的空格和换行符。

如何将两个代码点组合成一个？

为什么当我尝试重新分配它时，我的变量（winning）没有被重新分配？

更多的标头是否意味着服务器认为你是人类的机会更大？

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。