在Python中保存数据到XML文件

huangapple go评论163阅读模式
英文:

Save data in XML file in Python

问题

你可以尝试以下更改以确保所有五个评论都保存在文件中。首先,你可以将创建XML根元素的部分移到主函数内,以确保每个评论都能正确地添加到同一棵XML树中。然后,将文件写入操作移至循环之外(仍在主函数内),以避免在每次添加评论时都覆盖文件。以下是修改后的代码:

import requests
from bs4 import BeautifulSoup
import re
import json
import xml.etree.cElementTree as ET

# Download the TripAdvisor hotel review page and keep the response body as text.
source = requests.get('https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS').text

# Parse the HTML with the lxml parser.
soup = BeautifulSoup(source, 'lxml')
# Matches the inline `window.__WEB_CONTEXT__={pageManifest:{...}};` assignment;
# group 1 captures the brace-delimited object.
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
# Locate the <script> element whose text matches the pattern.
# NOTE(review): if no script matches, `script` is presumably None and the next
# line fails with AttributeError -- confirm whether that needs handling.
script = soup.find("script", text=pattern)
# Pull out the captured object text and decode it as JSON; the functions below
# read the result through the module-level name `jsonData`.
dictData = pattern.search(script.text).group(1)
jsonData = json.loads(dictData)

def get_countrycitydata():
    """Collect the country and city fields of the hotel from ``jsonData``.

    Walks the ``locations`` list stored under the ``'3960485871'`` entry of
    ``jsonData['urqlCache']``.  For every location the ``parents`` list is
    read: ``parents[2]['name']`` is stored as ``country_name`` and
    ``parents[0]`` supplies ``tripadvisorid_city`` (``locationId``) and
    ``city_name`` (``name``).

    Returns:
        dict: the three keys above.  When several locations are present the
        values of the last one win; when there are none the dict is empty.
    """
    result = {}
    for location in jsonData['urqlCache']['3960485871']['data']['locations']:
        parents = location['parents']
        city = parents[0]
        result.update(
            country_name=parents[2]['name'],
            tripadvisorid_city=city['locationId'],
            city_name=city['name'],
        )
    return result

def get_hoteldata():
    """Collect the hotel id and name from ``jsonData``.

    Walks the ``locations`` list stored under the ``'669061039'`` entry of
    ``jsonData['urqlCache']`` and copies each location's ``locationId`` and
    ``name``.

    Returns:
        dict: ``tripadvisorid_hotel`` and ``hotel_name``.  With several
        locations the last one wins; with none the dict is empty.
    """
    result = {}
    for location in jsonData['urqlCache']['669061039']['data']['locations']:
        result.update(
            tripadvisorid_hotel=location['locationId'],
            hotel_name=location['name'],
        )
    return result

def get_reviews():
    """Return one dict per review found in ``jsonData``.

    Every location under the ``'669061039'`` entry of
    ``jsonData['urqlCache']`` contributes the reviews listed in its
    ``reviewListPage``.  Line breaks inside the review text are replaced by
    single spaces.

    Returns:
        list[dict]: dicts with the keys ``reviewid``, ``reviewurl``,
        ``reviewlang``, ``reviewtitle`` and ``reviewtext``, in page order.
    """
    page_locations = jsonData['urqlCache']['669061039']['data']['locations']
    return [
        {
            "reviewid": entry['id'],
            "reviewurl": entry['absoluteUrl'],
            "reviewlang": entry['language'],
            "reviewtitle": entry['title'],
            "reviewtext": entry['text'].replace('\n', ' '),
        }
        for location in page_locations
        for entry in location['reviewListPage']['reviews']
    ]

def xml_tree(new_dict, root):
    """Append one ``<country>`` subtree describing a hotel to *root*.

    Args:
        new_dict: mapping with the keys ``country_name``,
            ``tripadvisorid_city``, ``city_name``, ``tripadvisorid_hotel``,
            ``hotel_name`` and ``reviews``; ``reviews`` is an iterable of
            mappings with ``reviewid``, ``reviewurl``, ``reviewlang``,
            ``reviewtitle`` and ``reviewtext``.
        root: the element that receives the new ``<country>`` child.

    The ids are converted with ``str``; all other values are stored as given.
    Nothing is returned; *root* is modified in place.
    """
    def add_leaf(parent, tag, value):
        # Create <tag> under parent and set its text in one step.
        ET.SubElement(parent, tag).text = value

    country = ET.SubElement(root, "country")
    add_leaf(country, "name", new_dict["country_name"])

    city = ET.SubElement(country, "city")
    add_leaf(city, "tripadvisorid", str(new_dict["tripadvisorid_city"]))
    add_leaf(city, "name", new_dict["city_name"])

    hotel = ET.SubElement(ET.SubElement(city, "hotels"), "hotel")
    add_leaf(hotel, "tripadvisorid", str(new_dict["tripadvisorid_hotel"]))
    add_leaf(hotel, "name", new_dict["hotel_name"])

    review_container = ET.SubElement(hotel, "reviews")
    for item in new_dict["reviews"]:
        review = ET.SubElement(review_container, "review")
        add_leaf(review, "reviewid", str(item["reviewid"]))
        for field in ("reviewurl", "reviewlang", "reviewtitle", "reviewtext"):
            add_leaf(review, field, item[field])

def main():
    """Build the XML document for the scraped hotel and write it to test.xml.

    The city, hotel and review data are merged into a single record, turned
    into one ``<country>`` subtree under a ``<countries>`` root, and the tree
    is written to ``test.xml`` in one go.
    """
    city_dict = get_countrycitydata()
    hotel_dict = get_hoteldata()
    review_list = get_reviews()

    new_dict = {**city_dict, **hotel_dict}
    new_dict["reviews"] = review_list

    root = ET.Element("countries")

    # xml_tree() appends a complete <country> subtree that already contains
    # every review, so it must run exactly once.  Calling it once per review
    # (as a loop over review_list would) duplicates the whole subtree.
    xml_tree(new_dict, root)

    tree = ET.ElementTree(root)
    tree.write("test.xml", encoding='unicode')


if __name__ == "__main__":
    main()

这样,你的XML树将在主函数中构建,然后一次性写入文件,确保所有五个评论都保存在同一个文件中。希望这对你有所帮助!

英文:

I am trying to save my data to an XML file. This data comes from a website where I want to collect the reviews. There are always five reviews per page, which I want to save in XML format in a file. The problem is that if I print out the XML tree with print(ET.tostring(root, encoding='utf8').decode('utf8')) then there are all five reviews that I want to have. But if I save them into the file with tree.write("test.xml", encoding='unicode') then I only see one review... Here is my code:

import requests
from bs4 import BeautifulSoup
import re
import json
import xml.etree.cElementTree as ET

# Download the TripAdvisor hotel review page and keep the response body as text.
source = requests.get('https://www.tripadvisor.ch/Hotel_Review-g188113-d228146-Reviews-Coronado_Hotel-Zurich.html#REVIEWS').text

# Parse the HTML with the lxml parser.
soup = BeautifulSoup(source, 'lxml')
# Matches the inline `window.__WEB_CONTEXT__={pageManifest:{...}};` assignment;
# group 1 captures the brace-delimited object.
pattern = re.compile(r'window.__WEB_CONTEXT__={pageManifest:(\{.*\})};')
# Locate the <script> element whose text matches the pattern.
# NOTE(review): if no script matches, `script` is presumably None and the next
# line fails with AttributeError -- confirm whether that needs handling.
script = soup.find("script", text=pattern)
# Pull out the captured object text and decode it as JSON; the functions below
# read the result through the module-level name `jsonData`.
dictData = pattern.search(script.text).group(1)
jsonData = json.loads(dictData)
	
def get_countrycitydata():
	"""Collect the country and city fields of the hotel from ``jsonData``.

	Walks the ``locations`` list stored under the ``'3960485871'`` entry of
	``jsonData['urqlCache']``.  For every location the ``parents`` list is
	read: ``parents[2]['name']`` is stored as ``country_name`` and
	``parents[0]`` supplies ``tripadvisorid_city`` (``locationId``) and
	``city_name`` (``name``).

	Returns:
		dict: the three keys above.  When several locations are present the
		values of the last one win; when there are none the dict is empty.
	"""
	result = {}
	for location in jsonData['urqlCache']['3960485871']['data']['locations']:
		parents = location['parents']
		city = parents[0]
		result.update(
			country_name=parents[2]['name'],
			tripadvisorid_city=city['locationId'],
			city_name=city['name'],
		)
	return result
	
def get_hoteldata():
	"""Collect the hotel id and name from ``jsonData``.

	Walks the ``locations`` list stored under the ``'669061039'`` entry of
	``jsonData['urqlCache']`` and copies each location's ``locationId`` and
	``name``.

	Returns:
		dict: ``tripadvisorid_hotel`` and ``hotel_name``.  With several
		locations the last one wins; with none the dict is empty.
	"""
	result = {}
	for location in jsonData['urqlCache']['669061039']['data']['locations']:
		result.update(
			tripadvisorid_hotel=location['locationId'],
			hotel_name=location['name'],
		)
	return result
	
def get_reviews():
	"""Return one dict per review found in ``jsonData``.

	Every location under the ``'669061039'`` entry of
	``jsonData['urqlCache']`` contributes the reviews listed in its
	``reviewListPage``.  Line breaks inside the review text are replaced by
	single spaces.

	Returns:
		list[dict]: dicts with the keys ``reviewid``, ``reviewurl``,
		``reviewlang``, ``reviewtitle`` and ``reviewtext``, in page order.
	"""
	page_locations = jsonData['urqlCache']['669061039']['data']['locations']
	return [
		{
			"reviewid": entry['id'],
			"reviewurl": entry['absoluteUrl'],
			"reviewlang": entry['language'],
			"reviewtitle": entry['title'],
			"reviewtext": entry['text'].replace('\n', ' '),
		}
		for location in page_locations
		for entry in location['reviewListPage']['reviews']
	]

def xml_tree(new_dict): # should I change something here???
	"""Build a <countries> document for ONE review, write it and print it.

	Args:
		new_dict: flat mapping holding the city fields (``country_name``,
			``tripadvisorid_city``, ``city_name``), the hotel fields
			(``tripadvisorid_hotel``, ``hotel_name``) and the fields of a
			single review (``reviewid``, ``reviewurl``, ``reviewlang``,
			``reviewtitle``, ``reviewtext``).

	Every call creates a brand-new root element, writes the resulting tree
	to ``test.xml`` and prints the serialized tree.  Because the same file
	name is written on each call, the file ends up holding only the tree of
	the most recent call, while the printed output shows every call.
	"""

	# A fresh document root is created on every call.
	root = ET.Element("countries")
	country = ET.SubElement(root, "country")

	ET.SubElement(country, "name").text = new_dict["country_name"]
	city = ET.SubElement(country, "city")

	# Ids are converted with str() before being stored as element text.
	ET.SubElement(city, "tripadvisorid").text = str(new_dict["tripadvisorid_city"])
	ET.SubElement(city, "name").text = new_dict["city_name"]
	hotels = ET.SubElement(city, "hotels")

	hotel = ET.SubElement(hotels, "hotel")
	ET.SubElement(hotel, "tripadvisorid").text = str(new_dict["tripadvisorid_hotel"])
	ET.SubElement(hotel, "name").text = new_dict["hotel_name"]
	reviews = ET.SubElement(hotel, "reviews")

	# Exactly one <review> is added: the one whose fields are in new_dict.
	review = ET.SubElement(reviews, "review")
	ET.SubElement(review, "reviewid").text = str(new_dict["reviewid"])
	ET.SubElement(review, "reviewurl").text = new_dict["reviewurl"]
	ET.SubElement(review, "reviewlang").text = new_dict["reviewlang"]
	ET.SubElement(review, "reviewtitle").text = new_dict["reviewtitle"]
	ET.SubElement(review, "reviewtext").text = new_dict["reviewtext"]

	# Write this single-review tree to test.xml under the same name each call.
	tree = ET.ElementTree(root)
	tree.write("test.xml", encoding='unicode')	

	# Echo the serialized tree to stdout.
	print(ET.tostring(root, encoding='utf8').decode('utf8'))
	
##########################################################	

def main():
	"""Fetch the city, hotel and review data and call xml_tree per review."""

	city_dict = get_countrycitydata()
	hotel_dict = get_hoteldata()
	review_list = get_reviews()

	# One xml_tree() call per review: each call receives the shared city and
	# hotel fields merged with the fields of that single review.
	for index in range(len(review_list)):
		new_dict = {**city_dict, **hotel_dict, **review_list[index]}
	
		xml_tree(new_dict)

# Run main() only when the file is executed as a script.
if __name__ == "__main__":
	main()	

How can I change the XML tree so that all five reviews are saved in the file? The XML file should look like this:

<countries>
	<country>
		<name>Schweiz</name>
		<city>
			<tripadvisorid>188113</tripadvisorid>
			<name>Zürich</name>
			<hotels>
				<hotel>
					<tripadvisorid>228146</tripadvisorid>
					<name>Hotel Coronado</name>
					<reviews>
						<review>
							<reviewid>672052111</reviewid> 
							<reviewurl>https://www.tripadvisor.ch/ShowUserReviews-g188113-d228146-r672052111-Coronado Hotel-Zurich.html</reviewurl>
							<reviewlang>de</reviewlang>
							<reviewtitle>Optimale Lage und Preis</reviewtitle>
							<reviewtext>Hervorragendes Hotel.Beste Erfahrun mit Service und Zimme.Die Qalität der Betten ist optimalr. Zimmer sind trotz geringer Größe sehr gut ausgestattet.Der Föhn war in diesem Fall (nicht in früheren)etwas lahm</reviewtext>
						</review>
                        <review>
                         second review here ...
                        </review>
                        <review>
                         third review here ...
                        </review>
                        ...
					</reviews>
				</hotel>
			</hotels>
		</city>
	</country>
</countries>

Thank you in advance for all suggestions!

答案1

得分: 2

因为你的 xml_tree(new_dict) 存在于一个 for 循环内,tree.write() 方法被多次调用,覆盖了你的文件。

使用 open() 以 a(追加)模式打开你的文件:

tree.write(open('test.xml', 'a'), encoding='unicode')

请查看文档 此处

英文:

Because your xml_tree(new_dict) exists inside of a for loop, the tree.write() method is being called multiple times overwriting your file.

Open your file in append (`a`) mode with open():

tree.write(open('test.xml', 'a'), encoding='unicode')

See documentation here

huangapple
  • 本文由 发表于 2020年1月6日 22:25:02
  • 转载请务必保留本文链接:https://go.coder-hub.com/59613778.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定