Python 3, bs4, webcrawler; error connecting to website

Question

I am trying to build a web crawler for a specific website, but for some reason it won't connect to the website. I get an error (one I raise myself) saying it can't connect. Using Selenium to call up the website, I can see that it doesn't connect.

As a newbie I am probably making a stupid mistake, but I can't figure out what it is. I hope you are willing to help me.

import csv
import requests
import datetime
from time import sleep, time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

browser = webdriver.Chrome('C:/Users/907133/Pythonstuff/chromedriver')
browser.set_window_position(0,0)
captcha = input('Press Enter after bypassing Captcha')

# def get_driver():
#     driver = webdriver.Chrome()
#     return driver


def get_driver():
    # initialize options
    options = webdriver.ChromeOptions()
    # pass in headless argument to options
    options.add_argument('--headless')
    # initialize driver
    driver = webdriver.Chrome(chrome_options=options)
    return driver


def connect_to_base(browser, page_number):
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    html = None
    links = None
    connection_attempts = 0
    while connection_attempts < 3:
        try:
            browser.get(base_url)
            # wait for an element with class 'result-content' to load
            # before returning True
            WebDriverWait(browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'result-content')))
            return True
        except Exception as ex:
            connection_attempts += 1
            print(f'Error connecting to {base_url}')
            print(f'Attempt #{connection_attempts}')
    return False


def parse_html(html):
        soup = BeautifulSoup(html, 'html.parser')
        inside = soup.find_all('a', {'class':'property-inner'},{'href'})
        # Make empty lists with header lines
        output_list = []
        listing = 1
        for items in inside:
            href = items.get('href')
            url1 = href.format(page)
        if len(browser.find_elements_by_xpath("//a[@class='CookiesOK']"))>0:
                browser.find_element_by_xpath("//a[@class='CookiesOK']").click()
        connection_attempts = 0
        while connection_attempts < 3:
            try:
                browser.get(url1)
                WebDriverWait(browser, 5).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'detail-address')))
                return True
            except Exception as ex:
                connection_attempts += 1
                print(f'Error connecting to {base_url}')
                print(f'Attempt #{connection_attempts}')

            details = BeautifulSoup(browser.page_source, 'html')
            adres = details.find_all ('div', {'class':'detail-address'})
            for adresinfo in adres:
                try:
                    adres = adres[0].get_text(separator=',', strip=True)
                except Indexerror:
                    adres = "Unknown"

            kenmerken = details.find_all ('div', {'class':'detail-tab-content kenmerken'})
            try:
                tr_kenmerken = ','.join([td.text.strip() for td in kenmerken[0].select('td.value')])
            except IndexError:
                tr_kenmerken = 'Unknown'

            waarde = details.find_all ('div', {'class':'detail-tab-content woningwaarde'})
            try:
                tr_waarde = ','.join([td.text.strip() for td in waarde[0].select('td.value')])
            except IndexError:
                tr_waarde = 'Unknown'

            informatie = {
                'adres': adres,
                'kenmerken': tr_kenmerken,
                'waarde': tr_waarde,
                'url': href
            }

            output_list.append(informatie)
            listing += 1
        return output_list


def get_load_time(article_url):
    try:
        # set headers
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        # make get request to article_url
        response = requests.get(
            article_url, headers=headers, stream=True, timeout=3.000)
        # get page load time
        load_time = response.elapsed.total_seconds()
    except Exception as ex:
        load_time = 'Loading Error'
    return load_time


def write_to_file(output_list, filename):
    for row in output_list:
        with open(filename, 'a') as csvfile:
            fieldnames = ['adres', 'kenmerken', 'waarde', 'link']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writerow(row)


def run_process(page_number, filename, browser):
    if connect_to_base(browser, page_number):
        sleep(2)
        html = browser.page_source
        output_list = parse_html(html)
        write_to_file(output_list, filename)
    else:
        print('Error connecting to jaap')

if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'output_{output_timestamp}.csv'
    browser = get_driver()
    # scrape and crawl
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        run_process(current_page, output_filename, browser)
        current_page = current_page + 1
    # exit
    browser.quit()
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')

Answer 1

Score: 1

I see you fixed EC.presence_of_element_located((By.ID, {'class': 'result-content'})) to be EC.presence_of_element_located((By.CLASS_NAME, 'result-content')).

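For reference, By.ID expects a plain id string, not a dict of attributes, which is why the original locator could never find the element. A minimal sketch of the change (reusing the browser object and imports from the question's code):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Broken: By.ID takes an id string, so a dict of attributes never matches
# EC.presence_of_element_located((By.ID, {'class': 'result-content'}))

# Fixed: locate the element by its class name instead
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CLASS_NAME, 'result-content')))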
Next, depending on where the browser is opened, you might have to bypass a JavaScript prompt by clicking that you are OK with accepting cookies.

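A minimal sketch of that click-through, assuming (as the question's own code does) that the consent link carries the class CookiesOK; the actual class name depends on the site:

from time import sleep

# Dismiss the cookie-consent banner if it is present. 'CookiesOK' is taken
# from the question's code and is an assumption about this site's markup.
consent_links = browser.find_elements_by_xpath("//a[@class='CookiesOK']")
if consent_links:
    consent_links[0].click()
    sleep(1)  # give the page a moment to settle after the banner closes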
But all that code seems like an awful lot of work considering the data is stored in JSON format inside the script tags of the HTML. Why not simply use requests, pull out the JSON, convert it to a dataframe, and then write it to CSV?

import requests
import datetime
from time import sleep, time
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize

def run_process(page_number):
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    jsonStr = soup.find('script', {'id': 'page-data'}).text
    jsonData = json.loads(jsonStr)

    df = json_normalize(jsonData['properties'])
    return df

if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'

    final_df = pd.DataFrame()
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        df = run_process(current_page)
        final_df = final_df.append(df, sort=True).reset_index(drop=True)
        current_page = current_page + 1
    final_df.to_csv(output_filename, index=False)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')

Output:

Scraping page #1...
Scraping page #2...
Scraping page #3...
Elapsed run time: 7.441420555114746 seconds

and the csv file looks like:

        app  area                                         detailsUrl  expired       houseTypeValue        id  latLng  latLng.latitude  latLng.longitude location.city               location.street location.zipcode   lotSize market numberOfRooms openHouseDate openHouseTimes  openhouse                             photo      price  priceToShow showoffColor showoffCustomText showoffPhotoText  spotlight             status  veiling
0  False  165  /te-koop/noord+holland/groot-amsterdam/amsterd...    False            Herenhuis   6899666     NaN        52.368420          4.833631     AMSTERDAM         Hof van Versailles 61           1064NX       216   sale             4          None           None      False  10014EAAF8B8883668593EFAC9E5FF1C   595000.0     595000.0         None              None             None      False               Sale    False
1   True  211  /te-koop/noord+holland/groot-amsterdam/amsterd...    False          Appartement  10585731     NaN        52.327550          4.889076     AMSTERDAM                Beysterveld 35           1083KA  Onbekend   sale             4          None           None      False  E4F9E5BC7BC90B5B92C7BD8D48B7A677   925000.0     925000.0         None              None             None      False               Sale    False
2   True  111  /te-koop/noord+holland/groot-amsterdam/amsterd...    False  Dubbele bovenwoning  11731386     NaN        52.341890          4.896053     AMSTERDAM      Uiterwaardenstraat 320 2           1079DC  Onbekend   sale             5          None           None      False  AB9F45B2CD4AD7879C5A80F18092F9D4   750000.0     750000.0         None              None             None      False  SoldConditionally    False
3  False  269  /te-koop/noord+holland/groot-amsterdam/amsterd...    False            Herenhuis  11840681     NaN        52.358266          4.875508     AMSTERDAM      Korte van Eeghenstraat 4           1071ER       107   sale             9          None           None      False  A3DF2B1D426B5E4D501503C5D0E66966  3100000.0    3100000.0         None              None             None      False               Sale    False
4  False  100  /te-koop/noord+holland/groot-amsterdam/amsterd...    False         Tussenwoning  12152943     NaN        52.421245          4.899478     AMSTERDAM  Pieter A v Heijningestraat 9           1035SV        83   sale             5          None           None      False  55C6F589523FA553D67A709776DD70DD   399000.0     399000.0         None              None             None      False               Sale    False
5   True  111  /te-koop/noord+holland/groot-amsterdam/amsterd...    False          Bovenwoning  15796874     NaN              NaN               NaN     AMSTERDAM      Eerste Amstelvlietpad 20           1096GB  Onbekend   sale             3          None           None      False  AE822B627ED096310B9ECBE7756340C8  1200000.0    1200000.0         None              None             None      False               Sale    False
6   True   76  /te-koop/noord+holland/groot-amsterdam/amsterd...    False        Benedenwoning  10580650     NaN        52.346010          4.888799     AMSTERDAM       Grevelingenstraat 18 HS           1078KP  Onbekend   sale             2          None           None      False  6FD1011D917E776DCF4DA836B5FFEE3E   550000.0     550000.0         None              None             None      False  SoldConditionally    False
7  False  298  /te-koop/noord+holland/groot-amsterdam/amsterd...    False                Villa   9623182     NaN        52.330610          4.862902     AMSTERDAM                 Cannenburg 51           1081GW       651   sale             7          None           None      False  15FA170B99D4E2DEA03B6FC27E3B5B74  2495000.0    2495000.0         None              None             None      False               Sale    False
8  False  270  /te-koop/noord+holland/groot-amsterdam/amsterd...    False            Herenhuis  15791215     NaN        52.347780          5.004530     AMSTERDAM            Nico Jessekade 189           1087MR       200   sale             9          None           None      False  6EA5C0CDA0475DFC88A3A918A6B2909A  1549000.0    1549000.0         None              None             None      False  SoldConditionally    False
9  False  201  /te-koop/noord+holland/groot-amsterdam/amsterd...    False                Villa   9617942     NaN        52.377391          4.764554     AMSTERDAM               Osdorperweg 803           1067SW      1348   sale             6          None           None      False  4680429D99EC5AC47C950D57A77DF1EB   950000.0     950000.0         None              None             None      False               Sale    False
UPDATE:

This version additionally follows each property's detailsUrl, reads the detail tables on each page with pd.read_html, and merges them into the main dataframe:
import requests
import datetime
from time import sleep, time
from bs4 import BeautifulSoup
import json
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np


def run_process(page_number):
    page_number = 1  # note: this overrides the page_number argument (looks like leftover debugging)
    base_url = f'https://www.jaap.nl/koophuizen/noord+holland/groot-amsterdam/amsterdam/p{page_number}'
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    jsonStr = soup.find('script', {'id': 'page-data'}).text
    jsonData = json.loads(jsonStr)

    df = json_normalize(jsonData['properties'])
    root_URL = 'https://jaap.nl'
    df['detailsUrl'] = root_URL + df['detailsUrl']

    allPropDetails = pd.DataFrame()
    for idx, row in df.iterrows():
        propDetails = pd.DataFrame(index=[0])
        w = 1
        detailLink = row['detailsUrl']
        print('Scraping: %s' % (row['location.street']))
        dfs = pd.read_html(detailLink)
        for each in dfs:
            #each = dfs[8]
            w = 1
            if each.isnull().all().all():
                continue
            each = each.dropna(axis=0, how='all')
            specialCase = False
            for col in list(each.columns):
                if each[col].dtypes == 'object':
                    if each[col].str.contains('Voorziening').any():
                        specialCase = True
                        break
            if specialCase == True:
                df_obj = each.select_dtypes(['object'])
                each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))
                cols1 = list(each.iloc[2:, 0])
                each = each.iloc[2:, :]
                each[1] = each[1] + '---' + each[2]
                each = each.iloc[:, -2]
                each.index = cols1
                each = each.to_frame().T
                propRow = each
                propRow.index = [0]
                temp_df = pd.DataFrame(index=[0])
                for col in propRow.columns:
                    temp_df = temp_df.merge(propRow[col].str.split('---', expand=True).rename(columns={0: col, 1: col + '.distance'}), left_index=True, right_index=True)
                propRow = temp_df
            else:
                df_obj = each.select_dtypes(['object'])
                each[df_obj.columns] = df_obj.apply(lambda x: x.str.rstrip('. '))
                temp_df = each.T
                cols = [temp_df.index[0] + '_' + colName for colName in list(temp_df.iloc[0, :])]
                propRow = temp_df.iloc[-1, :]
                propRow.index = cols
                propRow = propRow.to_frame().T
                propRow.index = [0]
            propDetails = propDetails.merge(propRow, left_index=True, right_index=True)
        propDetails.index = [idx]
        allPropDetails = allPropDetails.append(propDetails, sort=True)
    df = df.merge(allPropDetails, how='left', left_index=True, right_index=True)
    return df


if __name__ == '__main__':
    # set variables
    start_time = time()
    current_page = 1
    output_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    output_filename = f'C:/test/output_{output_timestamp}.csv'

    final_df = pd.DataFrame()
    while current_page <= 3:
        print(f'Scraping page #{current_page}...')
        df = run_process(current_page)
        final_df = final_df.append(df, sort=True).reset_index(drop=True)
        current_page = current_page + 1
    final_df.to_csv(output_filename, index=False)
    end_time = time()
    elapsed_time = end_time - start_time
    print(f'Elapsed run time: {elapsed_time} seconds')

huangapple
  • Posted on 2020-01-06 20:15:25
  • When reposting, please keep the original link: https://go.coder-hub.com/59611946.html