Scraping PDF files in Python with Requests and BeautifulSoup
Question
I understand that you want to improve your code for web scraping to detect PDF files even when they don't have the ".pdf" extension. To achieve this, you can modify your code as follows:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re  # Import the regular expressions module

url = "https://machado.mec.gov.br/obra-completa-lista/itemlist/category/24-conto"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    links = soup.find_all("a")
    results = []
    for link in links:
        href = link.get("href")
        if href is not None:
            file_url = url + href
            file_response = requests.head(file_url)
            content_type = file_response.headers.get("Content-Type")
            # A link is a PDF if the server reports application/pdf,
            # or if the path ends in ".pdf" (case-insensitive)
            is_pdf = (
                content_type == "application/pdf"
                or bool(re.search(r'\.pdf$', href, re.IGNORECASE))
            )
            status = file_response.status_code
            # Record every link with its status, including 404 (Not Found)
            results.append({"Link": file_url, "Status": status, "Arquivo": href, "PDF": is_pdf})
    df = pd.DataFrame(results)
    df  # displays the DataFrame when run in a notebook
else:
    print("Fail", response.status_code)
This modified code combines the Content-Type header check with a case-insensitive regular-expression test on the path, so a file served with the application/pdf header is detected even when the link has no ".pdf" extension.
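When a server omits the Content-Type header, or sends a generic one such as application/octet-stream, neither check above will fire. A further fallback is to read just the first few bytes of the file and look for the PDF magic number %PDF-. Below is a minimal sketch of that idea; the sniff_pdf helper name is mine, not part of the original code:

import requests

def sniff_pdf(file_url):
    # Stream the response and read only the first 5 bytes,
    # then close the connection without downloading the rest.
    try:
        with requests.get(file_url, stream=True, timeout=10) as r:
            first_bytes = next(r.iter_content(chunk_size=5), b"")
            return first_bytes.startswith(b"%PDF-")
    except requests.RequestException:
        return False

The result of sniff_pdf(file_url) could then simply be OR-ed into the is_pdf expression above.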
Original question (English):
So I'm trying to write code that scrapes a page and detects PDF files; the collected data then goes into a DataFrame. My question is: I want to update the code, because it only detects links that end with the ".pdf" extension. How could I make it detect PDF files in links without the extension?
My code is this:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://machado.mec.gov.br/obra-completa-lista/itemlist/category/24-conto"
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    links = soup.find_all("a")
    results = []
    for link in links:
        href = link.get("href")
        if href is not None:
            file_url = url + href
            file_response = requests.head(file_url)
            content_type = file_response.headers.get("Content-Type")
            is_pdf = content_type == 'application/pdf' or href.lower().endswith('.pdf')
            status = file_response.status_code
            if status == 404:  # Check whether the status is 404 (Not Found)
                results.append({"Link": file_url, "Status": status, "Arquivo": href, "PDF": is_pdf})
            else:
                results.append({"Link": file_url, "Status": status, "Arquivo": href, "PDF": is_pdf})
    df = pd.DataFrame(results)
    df
else:
    print("Fail", response.status_code)
I wrote the code and it runs correctly, but I want to improve it.
Answer 1
Score: 2
With file_url = url + href you're constructing URLs that don't exist on the server. Try parsing only the links that contain download and prepend just the domain (base_url):
import requests
import pandas as pd
from bs4 import BeautifulSoup

url = "https://machado.mec.gov.br/obra-completa-lista/itemlist/category/24-conto"
base_url = 'https://machado.mec.gov.br'
response = requests.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")
    links = soup.select("a[href*=download]")
    results = []
    for link in links:
        href = link["href"]
        if href.startswith('http'):
            file_url = href
        else:
            file_url = base_url + href
        file_response = requests.head(file_url)
        content_type = file_response.headers.get("Content-Type")
        is_pdf = content_type == 'application/pdf' or href.lower().endswith('.pdf')
        status = file_response.status_code
        if status == 404:  # Check whether the status is 404 (Not Found)
            results.append({"Link": file_url, "Status": status, "Arquivo": href, "PDF": is_pdf})
        else:
            results.append({"Link": file_url, "Status": status, "Arquivo": href, "PDF": is_pdf})
    df = pd.DataFrame(results)
    print(df)
else:
    print("Fail", response.status_code)
Prints:
Link Status Arquivo PDF
0 https://machado.mec.gov.br/obra-completa-lista/item/download/31_15b64419a44a2b6ba9781ae001275ae8 200 /obra-completa-lista/item/download/31_15b64419a44a2b6ba9781ae001275ae8 True
1 https://machado.mec.gov.br/obra-completa-lista/item/download/30_8e623caa384980ca20f48a66e691074f 200 /obra-completa-lista/item/download/30_8e623caa384980ca20f48a66e691074f True
2 https://machado.mec.gov.br/obra-completa-lista/item/download/29_008edfdf58623bb13d27157722a7281e 200 /obra-completa-lista/item/download/29_008edfdf58623bb13d27157722a7281e True
3 https://machado.mec.gov.br/obra-completa-lista/item/download/28_b10fd1f9a75bcaa4573e55e677660131 200 /obra-completa-lista/item/download/28_b10fd1f9a75bcaa4573e55e677660131 True
4 https://machado.mec.gov.br/obra-completa-lista/item/download/26_29eaa69154e158508ef8374fcb50937a 200 /obra-completa-lista/item/download/26_29eaa69154e158508ef8374fcb50937a True
5 https://machado.mec.gov.br/obra-completa-lista/item/download/25_fcddef9a9bd325ad2003c64f4f4eb884 200 /obra-completa-lista/item/download/25_fcddef9a9bd325ad2003c64f4f4eb884 True
6 https://machado.mec.gov.br/obra-completa-lista/item/download/24_938f74988ddbf449047ecc5c5b575985 200 /obra-completa-lista/item/download/24_938f74988ddbf449047ecc5c5b575985 True
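As a side note, instead of hand-building the absolute URL with base_url + href and a startswith('http') branch, the standard library's urllib.parse.urljoin handles absolute links, relative paths, and leading slashes uniformly. A minimal sketch (the second href is purely illustrative, not from the scraped page):

from urllib.parse import urljoin

base_url = 'https://machado.mec.gov.br'
hrefs = [
    '/obra-completa-lista/item/download/31_15b64419a44a2b6ba9781ae001275ae8',  # relative path from the page
    'https://example.org/file.pdf',  # already absolute; returned unchanged (illustrative)
]
for href in hrefs:
    # urljoin resolves relative hrefs against base_url and keeps absolute ones as-is
    print(urljoin(base_url, href))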