在Python中保存数据到XML文件

huangapple go评论76阅读模式
英文:

Save data in XML file in Python

问题

你可以尝试以下更改,以确保全部五条评论都保存到文件中。首先,将根元素的创建移到主函数中,并把它传给 xml_tree,这样每条评论都会被添加到同一棵 XML 树中。然后,将文件写入操作从 xml_tree 中移出,放到主函数的末尾,这样文件只会写入一次,而不会在每次添加评论时被覆盖。以下是修改后的代码:

import requests
from bs4 import BeautifulSoup
import re
import json
import xml.etree.cElementTree as ET

# Download the hotel page and extract the JSON object that the page assigns to
# ``window.__WEB_CONTEXT__`` inside a <script> tag. The parsed result is kept
# in the module-level ``jsonData`` that the get_* functions below read.
source = requests.get(
    'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS',
    timeout=30,  # without a timeout a stalled connection would block forever
).text

soup = BeautifulSoup(source, 'lxml')
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
# ``string=`` is the current name of the argument formerly spelled ``text=``.
script = soup.find("script", string=pattern)
if script is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise RuntimeError(
        "Could not find the window.__WEB_CONTEXT__ script in the downloaded page"
    )
dictData = pattern.search(script.text).group(1)
jsonData = json.loads(dictData)

def get_countrycitydata():
    """Return country and city details read from the module-level ``jsonData``.

    Every entry under ``urqlCache -> 3960485871 -> data -> locations`` is
    visited; for each one the first element of its ``parents`` list supplies
    the city id and name, and the third element supplies the country name.
    Later entries overwrite earlier ones, so the last entry wins.

    Returns:
        dict: Keys ``country_name``, ``tripadvisorid_city`` and ``city_name``,
        or an empty dict when there are no location entries.
    """
    result = {}

    for location in jsonData['urqlCache']['3960485871']['data']['locations']:
        ancestors = location['parents']
        result.update(
            country_name=ancestors[2]['name'],
            tripadvisorid_city=ancestors[0]['locationId'],
            city_name=ancestors[0]['name'],
        )

    return result

def get_hoteldata():
    """Return the hotel id and name read from the module-level ``jsonData``.

    Every entry under ``urqlCache -> 669061039 -> data -> locations`` is
    visited and its ``locationId`` and ``name`` are copied into the result.
    Later entries overwrite earlier ones, so the last entry wins.

    Returns:
        dict: Keys ``tripadvisorid_hotel`` and ``hotel_name``, or an empty
        dict when there are no location entries.
    """
    summary = {}

    for entry in jsonData['urqlCache']['669061039']['data']['locations']:
        summary.update(
            tripadvisorid_hotel=entry['locationId'],
            hotel_name=entry['name'],
        )

    return summary

def get_reviews():
    """Return one dict per review found in the module-level ``jsonData``.

    Reviews are read from ``reviewListPage -> reviews`` of every entry under
    ``urqlCache -> 669061039 -> data -> locations``, in document order.
    Newlines in the review text are replaced by single spaces.

    Returns:
        list[dict]: Each dict has the keys ``reviewid``, ``reviewurl``,
        ``reviewlang``, ``reviewtitle`` and ``reviewtext``.
    """
    hotel_entries = jsonData['urqlCache']['669061039']['data']['locations']

    return [
        {
            "reviewid": item['id'],
            "reviewurl": item['absoluteUrl'],
            "reviewlang": item['language'],
            "reviewtitle": item['title'],
            "reviewtext": item['text'].replace('\n', ' '),
        }
        for entry in hotel_entries
        for item in entry['reviewListPage']['reviews']
    ]

def xml_tree(new_dict, root):
    """Append one ``<country>`` subtree for a hotel and its reviews to *root*.

    The subtree is nested as country -> city -> hotels -> hotel -> reviews,
    with one ``<review>`` element per item of ``new_dict["reviews"]``.

    Args:
        new_dict: Mapping with the keys ``country_name``,
            ``tripadvisorid_city``, ``city_name``, ``tripadvisorid_hotel``,
            ``hotel_name`` and ``reviews``; each review is a mapping with
            ``reviewid``, ``reviewurl``, ``reviewlang``, ``reviewtitle`` and
            ``reviewtext``. The id values are converted with ``str()``.
        root: Element that receives the new ``<country>`` child; it is
            modified in place.

    Returns:
        None.
    """

    def add_leaf(parent, tag, value):
        # Create <tag> under parent and set its text content.
        ET.SubElement(parent, tag).text = value

    country_el = ET.SubElement(root, "country")
    add_leaf(country_el, "name", new_dict["country_name"])

    city_el = ET.SubElement(country_el, "city")
    add_leaf(city_el, "tripadvisorid", str(new_dict["tripadvisorid_city"]))
    add_leaf(city_el, "name", new_dict["city_name"])

    hotel_el = ET.SubElement(ET.SubElement(city_el, "hotels"), "hotel")
    add_leaf(hotel_el, "tripadvisorid", str(new_dict["tripadvisorid_hotel"]))
    add_leaf(hotel_el, "name", new_dict["hotel_name"])

    reviews_el = ET.SubElement(hotel_el, "reviews")
    for entry in new_dict["reviews"]:
        review_el = ET.SubElement(reviews_el, "review")
        # Only the id is stringified; the remaining fields are used as given.
        add_leaf(review_el, "reviewid", str(entry["reviewid"]))
        for field in ("reviewurl", "reviewlang", "reviewtitle", "reviewtext"):
            add_leaf(review_el, field, entry[field])

def main():
    """Gather the scraped data and write it to ``test.xml`` as one XML tree.

    The country/city data, the hotel data and the list of reviews are merged
    into a single dict, turned into a ``<countries>`` tree by ``xml_tree``
    and written to ``test.xml`` in the current working directory.

    Returns:
        None.
    """
    city_dict = get_countrycitydata()
    hotel_dict = get_hoteldata()
    review_list = get_reviews()

    root = ET.Element("countries")

    # xml_tree() already emits one <review> per item of new_dict["reviews"],
    # so it must be called once. Calling it once per review (as a loop over
    # review_list would) appends a duplicate <country> subtree containing all
    # reviews on every iteration.
    # With no reviews nothing is added and only the empty root is written.
    if review_list:
        new_dict = {**city_dict, **hotel_dict, "reviews": review_list}
        xml_tree(new_dict, root)

    tree = ET.ElementTree(root)
    # An explicit UTF-8 encoding keeps the file independent of the platform's
    # default text encoding; for UTF-8 no XML declaration is added by default.
    tree.write("test.xml", encoding="utf-8")

# Call main() only when this file is executed directly as a script.
if __name__ == "__main__":
    main()

这样,你的XML树将在主函数中构建,然后一次性写入文件,确保所有五个评论都保存在同一个文件中。希望这对你有所帮助!

英文:

I am trying to save my data to an XML file. This data comes from a website where I want to collect the reviews. There are always five reviews per page, which I want to save in XML format in a file. The problem is that if I print out the XML tree with print(ET.tostring(root, encoding='utf8').decode('utf8')) then there are all five reviews that I want to have. But if I save them into the file with tree.write("test.xml", encoding='unicode') then I only see one review... Here is my code:

import requests
from bs4 import BeautifulSoup
import re
import json
import xml.etree.cElementTree as ET
source = requests.get('https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS').text
soup = BeautifulSoup(source, 'lxml')
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
script = soup.find("script", text=pattern)
dictData = pattern.search(script.text).group(1)
jsonData = json.loads(dictData)
def get_countrycitydata():
countrycity_dict = dict()
country_data = jsonData['urqlCache']['3960485871']['data']['locations']
for data in country_data:
data1 = data['parents']
countrycity_dict["country_name"] = data1[2]['name']
countrycity_dict["tripadvisorid_city"] = data1[0]['locationId']
countrycity_dict["city_name"] = data1[0]['name']
return countrycity_dict
def get_hoteldata():
hotel_dict = dict()
locations = jsonData['urqlCache']['669061039']['data']['locations']
for data in locations:
hotel_dict["tripadvisorid_hotel"] = data['locationId']
hotel_dict["hotel_name"] = data['name']
return hotel_dict
def get_reviews():	
all_dictionaries = []
for locations in jsonData['urqlCache']['669061039']['data']['locations']:
for reviews in locations['reviewListPage']['reviews']:
review_dict = {}
review_dict["reviewid"] = reviews['id']
review_dict["reviewurl"] =  reviews['absoluteUrl']
review_dict["reviewlang"] = reviews['language']
review_dict["reviewtitle"] = reviews['title']
reviewtext = reviews['text']
clean_reviewtext = reviewtext.replace('\n', ' ')
review_dict["reviewtext"] = clean_reviewtext
all_dictionaries.append(review_dict)
return all_dictionaries
def xml_tree(new_dict): # should I change something here???
root = ET.Element("countries")
country = ET.SubElement(root, "country")
ET.SubElement(country, "name").text = new_dict["country_name"]
city = ET.SubElement(country, "city")
ET.SubElement(city, "tripadvisorid").text = str(new_dict["tripadvisorid_city"])
ET.SubElement(city, "name").text = new_dict["city_name"]
hotels = ET.SubElement(city, "hotels")
hotel = ET.SubElement(hotels, "hotel")
ET.SubElement(hotel, "tripadvisorid").text = str(new_dict["tripadvisorid_hotel"])
ET.SubElement(hotel, "name").text = new_dict["hotel_name"]
reviews = ET.SubElement(hotel, "reviews")
review = ET.SubElement(reviews, "review")
ET.SubElement(review, "reviewid").text = str(new_dict["reviewid"])
ET.SubElement(review, "reviewurl").text = new_dict["reviewurl"]
ET.SubElement(review, "reviewlang").text = new_dict["reviewlang"]
ET.SubElement(review, "reviewtitle").text = new_dict["reviewtitle"]
ET.SubElement(review, "reviewtext").text = new_dict["reviewtext"]
tree = ET.ElementTree(root)
tree.write("test.xml", encoding='unicode')	
print(ET.tostring(root, encoding='utf8').decode('utf8'))
##########################################################	
def main():
city_dict = get_countrycitydata()
hotel_dict = get_hoteldata()
review_list = get_reviews()
for index in range(len(review_list)):
new_dict = {**city_dict, **hotel_dict, **review_list[index]}
xml_tree(new_dict)
if __name__ == "__main__":
main()	

How can I change the XML tree so that all five reviews are saved in the file? The XML file should look like this:

<countries>
<country>
<name>Schweiz</name>
<city>
<tripadvisorid>188113</tripadvisorid>
<name>Zürich</name>
<hotels>
<hotel>
<tripadvisorid>228146</tripadvisorid>
<name>Hotel Coronado</name>
<reviews>
<review>
<reviewid>672052111</reviewid> 
<reviewurl>https://www.tripadvisor.ch/ShowUserReviews-g188113-d228146-r672052111-Coronado Hotel-Zurich.html</reviewurl>
<reviewlang>de</reviewlang>
<reviewtitle>Optimale Lage und Preis</reviewtitle>
<reviewtext>Hervorragendes Hotel.Beste Erfahrun mit Service und Zimme.Die Qalität der Betten ist optimalr. Zimmer sind trotz geringer Größe sehr gut ausgestattet.Der Föhn war in diesem Fall (nicht in früheren)etwas lahm</reviewtext>
</review>
<review>
second review here ...
</review>
<review>
third review here ...
</review>
...
</reviews>
</hotel>
</hotels>
</city>
</country>
</countries>

Thank you in advance for all suggestions!

答案1

得分: 2

因为你的 xml_tree(new_dict) 存在于一个 for 循环内,tree.write() 方法被多次调用,覆盖了你的文件。

使用 open() 以 a(追加)模式打开你的文件:

tree.write(open('test.xml', 'a'), encoding='unicode')

请查看文档 此处

英文:

Because your xml_tree(new_dict) exists inside of a for loop, the tree.write() method is being called multiple times overwriting your file.

Open your file in a (append) mode with open():

tree.write(open('test.xml', 'a'), encoding='unicode')

See documentation here

huangapple
  • 本文由 发表于 2020年1月6日 22:25:02
  • 转载请务必保留本文链接:https://go.coder-hub.com/59613778.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定