英文:
Save data in XML file in Python
问题
你可以尝试以下更改,以确保所有五个评论都保存在文件中。首先,在主函数中只创建一次XML根元素,并把它传给 xml_tree,使所有评论都添加到同一棵XML树中。然后,将文件写入操作从 xml_tree 中移到主函数里,在整棵树构建完成后只写入一次,以避免每次添加评论时都覆盖文件。以下是修改后的代码:
import requests
from bs4 import BeautifulSoup
import re
import json
import xml.etree.cElementTree as ET
# Download the hotel page. The timeout keeps a stalled connection from
# blocking the script indefinitely.
source = requests.get(
    'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS',
    timeout=30,
).text
soup = BeautifulSoup(source, 'lxml')
# The data is read from an inline <script> whose text matches
# `window.__WEB_CONTEXT__={pageManifest:{...}};`; group 1 is the inner object.
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
script = soup.find("script", text=pattern)
if script is None:
    # Fail with a clear message instead of an AttributeError on `None.text`
    # when the page does not contain the expected script.
    raise RuntimeError("window.__WEB_CONTEXT__ script not found in the downloaded page")
dictData = pattern.search(script.text).group(1)
# Assumes the captured object literal is valid JSON -- TODO confirm if the
# page format changes.
jsonData = json.loads(dictData)
def get_countrycitydata():
    """Return country and city details read from the module-level ``jsonData``.

    Reads ``jsonData['urqlCache']['3960485871']['data']['locations']`` and,
    for each location, takes values from its ``parents`` list.

    Returns:
        dict: keys ``country_name``, ``tripadvisorid_city`` and ``city_name``.
        If several locations are present, the last one wins; if there are
        none, the dict is empty.
    """
    countrycity_dict = {}
    country_data = jsonData['urqlCache']['3960485871']['data']['locations']
    for data in country_data:
        # Presumably ``parents`` is ordered from the nearest ancestor outwards
        # (index 0 = city, index 2 = country) -- verify against the payload.
        data1 = data['parents']
        countrycity_dict["country_name"] = data1[2]['name']
        countrycity_dict["tripadvisorid_city"] = data1[0]['locationId']
        countrycity_dict["city_name"] = data1[0]['name']
    return countrycity_dict
def get_hoteldata():
    """Return the hotel id and name read from the module-level ``jsonData``.

    Reads ``jsonData['urqlCache']['669061039']['data']['locations']``.

    Returns:
        dict: keys ``tripadvisorid_hotel`` and ``hotel_name``. If several
        locations are present, the last one wins; if there are none, the
        dict is empty.
    """
    hotel_dict = {}
    locations = jsonData['urqlCache']['669061039']['data']['locations']
    for data in locations:
        hotel_dict["tripadvisorid_hotel"] = data['locationId']
        hotel_dict["hotel_name"] = data['name']
    return hotel_dict
def get_reviews():
    """Return one dict per review found in the module-level ``jsonData``.

    Walks every location under
    ``jsonData['urqlCache']['669061039']['data']['locations']`` and every
    entry of its ``reviewListPage.reviews`` list.

    Returns:
        list[dict]: each dict has the keys ``reviewid``, ``reviewurl``,
        ``reviewlang``, ``reviewtitle`` and ``reviewtext``, in the order the
        reviews appear in the payload.
    """
    locations = jsonData['urqlCache']['669061039']['data']['locations']
    return [
        {
            "reviewid": review['id'],
            "reviewurl": review['absoluteUrl'],
            "reviewlang": review['language'],
            "reviewtitle": review['title'],
            # Replace line breaks with spaces so the text is a single line.
            "reviewtext": review['text'].replace('\n', ' '),
        }
        for location in locations
        for review in location['reviewListPage']['reviews']
    ]
def xml_tree(new_dict, root):
    """Append one ``<country>`` subtree, including all its reviews, to *root*.

    Args:
        new_dict: mapping with the keys ``country_name``,
            ``tripadvisorid_city``, ``city_name``, ``tripadvisorid_hotel``,
            ``hotel_name`` and ``reviews``. ``reviews`` is an iterable of
            mappings with the keys ``reviewid``, ``reviewurl``,
            ``reviewlang``, ``reviewtitle`` and ``reviewtext``.
        root: the ``ET.Element`` that receives the new ``<country>`` child.

    Returns:
        None. *root* is modified in place.

    Raises:
        KeyError: if any of the keys listed above is missing.
    """
    country = ET.SubElement(root, "country")
    ET.SubElement(country, "name").text = new_dict["country_name"]

    city = ET.SubElement(country, "city")
    # Ids are passed through str() before being stored as element text.
    ET.SubElement(city, "tripadvisorid").text = str(new_dict["tripadvisorid_city"])
    ET.SubElement(city, "name").text = new_dict["city_name"]

    hotels = ET.SubElement(city, "hotels")
    hotel = ET.SubElement(hotels, "hotel")
    ET.SubElement(hotel, "tripadvisorid").text = str(new_dict["tripadvisorid_hotel"])
    ET.SubElement(hotel, "name").text = new_dict["hotel_name"]

    # All reviews share one <reviews> container under the hotel.
    reviews = ET.SubElement(hotel, "reviews")
    for review_data in new_dict["reviews"]:
        review = ET.SubElement(reviews, "review")
        ET.SubElement(review, "reviewid").text = str(review_data["reviewid"])
        ET.SubElement(review, "reviewurl").text = review_data["reviewurl"]
        ET.SubElement(review, "reviewlang").text = review_data["reviewlang"]
        ET.SubElement(review, "reviewtitle").text = review_data["reviewtitle"]
        ET.SubElement(review, "reviewtext").text = review_data["reviewtext"]
def main():
    """Gather the scraped data and write it to ``test.xml`` as one XML tree."""
    city_dict = get_countrycitydata()
    hotel_dict = get_hoteldata()
    review_list = get_reviews()

    root = ET.Element("countries")
    # xml_tree() already iterates over every review in new_dict["reviews"],
    # so it is called exactly once. Calling it once per review would append
    # a duplicate <country> subtree (each holding all reviews) every time.
    new_dict = {**city_dict, **hotel_dict}
    new_dict["reviews"] = review_list
    xml_tree(new_dict, root)

    # Write the file once, after the tree is complete.
    tree = ET.ElementTree(root)
    tree.write("test.xml", encoding='unicode')


if __name__ == "__main__":
    main()
这样,你的XML树将在主函数中构建,然后一次性写入文件,确保所有五个评论都保存在同一个文件中。希望这对你有所帮助!
英文:
I am trying to save my data to an XML file. This data comes from a website where I want to collect the reviews. There are always five reviews per page, which I want to save in XML format in a file. The problem is that if I print out the XML tree with print(ET.tostring(root, encoding='utf8').decode('utf8'))
then there are all five reviews that I want to have. But if I save them into the file with tree.write("test.xml", encoding='unicode')
then I only see one review... Here is my code:
import requests
from bs4 import BeautifulSoup
import re
import json
import xml.etree.cElementTree as ET
# Download the hotel page. The timeout keeps a stalled connection from
# blocking the script indefinitely.
source = requests.get(
    'https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS',
    timeout=30,
).text
soup = BeautifulSoup(source, 'lxml')
# The data is read from an inline <script> whose text matches
# `window.__WEB_CONTEXT__={pageManifest:{...}};`; group 1 is the inner object.
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
script = soup.find("script", text=pattern)
if script is None:
    # Fail with a clear message instead of an AttributeError on `None.text`
    # when the page does not contain the expected script.
    raise RuntimeError("window.__WEB_CONTEXT__ script not found in the downloaded page")
dictData = pattern.search(script.text).group(1)
# Assumes the captured object literal is valid JSON -- TODO confirm if the
# page format changes.
jsonData = json.loads(dictData)
def get_countrycitydata():
    """Return country and city details read from the module-level ``jsonData``.

    Reads ``jsonData['urqlCache']['3960485871']['data']['locations']`` and,
    for each location, takes values from its ``parents`` list.

    Returns:
        dict: keys ``country_name``, ``tripadvisorid_city`` and ``city_name``.
        If several locations are present, the last one wins; if there are
        none, the dict is empty.
    """
    countrycity_dict = {}
    country_data = jsonData['urqlCache']['3960485871']['data']['locations']
    for data in country_data:
        # Presumably ``parents`` is ordered from the nearest ancestor outwards
        # (index 0 = city, index 2 = country) -- verify against the payload.
        data1 = data['parents']
        countrycity_dict["country_name"] = data1[2]['name']
        countrycity_dict["tripadvisorid_city"] = data1[0]['locationId']
        countrycity_dict["city_name"] = data1[0]['name']
    return countrycity_dict
def get_hoteldata():
    """Return the hotel id and name read from the module-level ``jsonData``.

    Reads ``jsonData['urqlCache']['669061039']['data']['locations']``.

    Returns:
        dict: keys ``tripadvisorid_hotel`` and ``hotel_name``. If several
        locations are present, the last one wins; if there are none, the
        dict is empty.
    """
    hotel_dict = {}
    locations = jsonData['urqlCache']['669061039']['data']['locations']
    for data in locations:
        hotel_dict["tripadvisorid_hotel"] = data['locationId']
        hotel_dict["hotel_name"] = data['name']
    return hotel_dict
def get_reviews():
    """Return one dict per review found in the module-level ``jsonData``.

    Walks every location under
    ``jsonData['urqlCache']['669061039']['data']['locations']`` and every
    entry of its ``reviewListPage.reviews`` list.

    Returns:
        list[dict]: each dict has the keys ``reviewid``, ``reviewurl``,
        ``reviewlang``, ``reviewtitle`` and ``reviewtext``, in the order the
        reviews appear in the payload.
    """
    locations = jsonData['urqlCache']['669061039']['data']['locations']
    return [
        {
            "reviewid": review['id'],
            "reviewurl": review['absoluteUrl'],
            "reviewlang": review['language'],
            "reviewtitle": review['title'],
            # Replace line breaks with spaces so the text is a single line.
            "reviewtext": review['text'].replace('\n', ' '),
        }
        for location in locations
        for review in location['reviewListPage']['reviews']
    ]
def xml_tree(new_dict):  # should I change something here???
    """Build a single-review XML document, save it to ``test.xml`` and print it.

    Args:
        new_dict: flat mapping with the keys ``country_name``,
            ``tripadvisorid_city``, ``city_name``, ``tripadvisorid_hotel``,
            ``hotel_name``, ``reviewid``, ``reviewurl``, ``reviewlang``,
            ``reviewtitle`` and ``reviewtext``.

    Returns:
        None.

    Raises:
        KeyError: if any of the keys listed above is missing.
    """
    # A fresh root is created on every call, so each call produces a
    # complete, independent document containing exactly one <review>.
    root = ET.Element("countries")
    country = ET.SubElement(root, "country")
    ET.SubElement(country, "name").text = new_dict["country_name"]

    city = ET.SubElement(country, "city")
    ET.SubElement(city, "tripadvisorid").text = str(new_dict["tripadvisorid_city"])
    ET.SubElement(city, "name").text = new_dict["city_name"]

    hotels = ET.SubElement(city, "hotels")
    hotel = ET.SubElement(hotels, "hotel")
    ET.SubElement(hotel, "tripadvisorid").text = str(new_dict["tripadvisorid_hotel"])
    ET.SubElement(hotel, "name").text = new_dict["hotel_name"]

    reviews = ET.SubElement(hotel, "reviews")
    review = ET.SubElement(reviews, "review")
    ET.SubElement(review, "reviewid").text = str(new_dict["reviewid"])
    ET.SubElement(review, "reviewurl").text = new_dict["reviewurl"]
    ET.SubElement(review, "reviewlang").text = new_dict["reviewlang"]
    ET.SubElement(review, "reviewtitle").text = new_dict["reviewtitle"]
    ET.SubElement(review, "reviewtext").text = new_dict["reviewtext"]

    tree = ET.ElementTree(root)
    # NOTE: the file is rewritten from scratch on every call, so when this
    # function is called repeatedly only the most recent review stays on disk.
    tree.write("test.xml", encoding='unicode')
    print(ET.tostring(root, encoding='utf8').decode('utf8'))
##########################################################
def main():
    """Build a flat record for each review and pass it to ``xml_tree``."""
    city_dict = get_countrycitydata()
    hotel_dict = get_hoteldata()
    review_list = get_reviews()
    for review in review_list:
        # Merge country/city, hotel and this review's fields into one dict.
        new_dict = {**city_dict, **hotel_dict, **review}
        # NOTE: xml_tree() writes test.xml on each call, so after the loop
        # the file holds only the last review.
        xml_tree(new_dict)


if __name__ == "__main__":
    main()
How can I change the XML tree so that all five reviews are saved in the file? The XML file should look like this:
<countries>
<country>
<name>Schweiz</name>
<city>
<tripadvisorid>188113</tripadvisorid>
<name>Zürich</name>
<hotels>
<hotel>
<tripadvisorid>228146</tripadvisorid>
<name>Hotel Coronado</name>
<reviews>
<review>
<reviewid>672052111</reviewid>
<reviewurl>https://www.tripadvisor.ch/ShowUserReviews-g188113-d228146-r672052111-Coronado Hotel-Zurich.html</reviewurl>
<reviewlang>de</reviewlang>
<reviewtitle>Optimale Lage und Preis</reviewtitle>
<reviewtext>Hervorragendes Hotel.Beste Erfahrun mit Service und Zimme.Die Qalität der Betten ist optimalr. Zimmer sind trotz geringer Größe sehr gut ausgestattet.Der Föhn war in diesem Fall (nicht in früheren)etwas lahm</reviewtext>
</review>
<review>
second review here ...
</review>
<review>
third review here ...
</review>
...
</reviews>
</hotel>
</hotels>
</city>
</country>
</countries>
Thank you in advance for all suggestions!
答案1
得分: 2
因为你的 xml_tree(new_dict) 是在一个 for 循环内被调用的,tree.write() 方法会被多次调用,每次都会覆盖你的文件。
请使用 open() 以 a(追加)模式打开你的文件:
# Use a context manager so the file handle is closed (and flushed) after writing.
# NOTE: every call appends another complete serialized tree, so the file
# ends up containing one root element per call.
with open('test.xml', 'a') as f:
    tree.write(f, encoding='unicode')
请查看文档 此处
英文:
Because your xml_tree(new_dict) is called inside a for loop, the tree.write() method is called multiple times, overwriting your file each time.
Open your file in a (append) mode with open():
# Use a context manager so the file handle is closed (and flushed) after writing.
# NOTE: every call appends another complete serialized tree, so the file
# ends up containing one root element per call.
with open('test.xml', 'a') as f:
    tree.write(f, encoding='unicode')
See documentation here
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论