2023年2月27日 11:56:06go评论97阅读模式

英文:

How to scrape multiple web pages and translate them from English to Hindi using Python?

问题

I am struggling with a small issue. The code works with no errors, but I need to figure out how to translate multiple pages of the website from English to Hindi, so that every page is in Hindi. So far I have only translated one specific piece of text from the main page.

# Script scrapes the website using the requests and BeautifulSoup libraries
from google_translate import browser
from google_translate import selenium
import requests
from bs4 import BeautifulSoup
URL = "https://www.classcentral.com/?"
# Send a desktop-browser User-Agent (this string identifies Edge on Windows 10).
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url=URL, headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page.
r.raise_for_status()
print(r.content)
# Parse the HTML.
soup = BeautifulSoup(r.content, 'html.parser')
# Visit only the anchor tags that actually carry an "href" attribute;
# a bare find_all('a') also yields anchors without one and prints None.
for link in soup.find_all('a', href=True):
    print(link['href'])

# Script translates text into Hindi by driving the Google Translate web page with Selenium
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium
from urllib.parse import quote, urlencode

# Target language code; "hi" is Hindi. No surrounding whitespace, because the
# value is placed into the URL query string.
lang_code = 'hi'
# Text to translate.
input1 = " Find your next course.Class Central aggregates courses from many providers to help you find the best courses on almost any subject, wherever they exist"
# Launch Chrome through Selenium. If chromedriver cannot be found
# automatically, configure its location as described in the Selenium docs.
browser = webdriver.Chrome()
try:
    # Percent-encode every parameter so spaces, '&', '#', etc. in the text
    # cannot break or truncate the query string.
    query = urlencode(
        {"sl": "auto", "tl": lang_code, "text": input1, "op": "translate"},
        quote_via=quote,
    )
    browser.get("https://translate.google.co.in/?" + query)
    # Give the page some time to render the translation.
    time.sleep(6)
    # The element with class "HwtZe" holds the translated output.
    output1 = browser.find_element(By.CLASS_NAME, 'HwtZe').text
    # Display the output.
    print("Translated Paragraph=> " + output1)
finally:
    # Always shut the browser down, even if the lookup above fails.
    browser.quit()

英文:

# Script scrapes the website using the requests and BeautifulSoup libraries
from google_translate import browser
from google_translate import selenium
import requests
from bs4 import BeautifulSoup
URL = "https://www.classcentral.com/?"
# Send a desktop-browser User-Agent (this string identifies Edge on Windows 10).
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
# A timeout keeps the script from hanging forever on an unresponsive server.
r = requests.get(url=URL, headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of parsing an error page.
r.raise_for_status()
print(r.content)
# Parse the HTML.
soup = BeautifulSoup(r.content, 'html.parser')
# Visit only the anchor tags that actually carry an "href" attribute;
# a bare find_all('a') also yields anchors without one and prints None.
for link in soup.find_all('a', href=True):
    print(link['href'])

# Script translates text into Hindi by driving the Google Translate web page with Selenium
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium
from urllib.parse import quote, urlencode

# Target language code; "hi" is Hindi. No surrounding whitespace, because the
# value is placed into the URL query string.
lang_code = 'hi'
# Text to translate.
input1 = " Find your next course.Class Central aggregates courses from many providers to help you find the best courses on almost any subject, wherever they exist"
# Launch Chrome through Selenium. If chromedriver cannot be found
# automatically, configure its location as described in the Selenium docs.
browser = webdriver.Chrome()
try:
    # Percent-encode every parameter so spaces, '&', '#', etc. in the text
    # cannot break or truncate the query string.
    query = urlencode(
        {"sl": "auto", "tl": lang_code, "text": input1, "op": "translate"},
        quote_via=quote,
    )
    browser.get("https://translate.google.co.in/?" + query)
    # Give the page some time to render the translation.
    time.sleep(6)
    # The element with class "HwtZe" holds the translated output.
    output1 = browser.find_element(By.CLASS_NAME, 'HwtZe').text
    # Display the output.
    print("Translated Paragraph:=> " + output1)
finally:
    # Always shut the browser down, even if the lookup above fails.
    browser.quit()

答案1

得分: 1

Google翻译存在一些限制。根据我的理解，您无法在单个请求中翻译所有字符。因此，我建议您将文本拆分成多个请求进行翻译。

在下面的代码中，我使用了googletrans模块，在从网站获取文本后将其翻译成印地语。作为替代方案，您可以尝试以下代码，希望对您有所帮助：

import requests
from bs4 import BeautifulSoup
from googletrans import Translator
# Module-level googletrans client; language_translator() calls its translate().
translator = Translator()
def scrape_web_page(url, timeout=30):
    """Download *url* and return the text content of the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the parsed HTML, as produced by
        ``BeautifulSoup.get_text()``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # Without a timeout a stalled server would block the caller indefinitely.
    response = requests.get(url, timeout=timeout)
    # Do not hand an error page to the translator as if it were content.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()
def _chunk_words(words, max_chars):
    """Group *words* into space-joined chunks of at most *max_chars* characters.

    A single word longer than *max_chars* becomes a chunk of its own.
    """
    chunks = []
    current = []
    current_len = 0
    for word in words:
        # One extra character for the joining space, except for the first word.
        added = len(word) + (1 if current else 0)
        if current and current_len + added > max_chars:
            chunks.append(" ".join(current))
            current = [word]
            current_len = len(word)
        else:
            current.append(word)
            current_len += added
    if current:
        chunks.append(" ".join(current))
    return chunks


def language_translator(urls, max_chars=4000):
    """Scrape each page in *urls*, translate its text to Hindi and print it.

    The page text is translated in chunks of whole words rather than one
    word per request: this needs far fewer network round trips and lets the
    translator see each word in context.

    Args:
        urls: Iterable of page addresses, processed in order.
        max_chars: Upper bound on the size of each chunk sent to the
            translator. The default is meant to stay well below the
            per-request text limit described in the googletrans docs
            (TODO confirm for the installed version).

    Returns:
        None. The translated text of every page is printed.
    """
    for count, url in enumerate(urls, start=1):
        print(f'.........................从页面 {count} ...........................................')
        text = scrape_web_page(url)
        # Collect the pieces and join once instead of growing a string in a loop.
        parts = [f"从页面 {count}"]
        for chunk in _chunk_words(text.split(), max_chars):
            translated_text = translator.translate(chunk, dest='hi')
            parts.append(str(translated_text.text))
        print(" ".join(parts))
# Pages to translate, listed in the order they should be processed.
urls = [
    'https://demo1/page1',
    'https://demo1/page2',
]
# Kick off scraping and translation for every listed page.
language_translator(urls)

注意：网站抓取涉及一些版权问题。

英文:

Google Translate has some limitations. As far as I understand, you can't translate an unlimited amount of text in a single request, so I recommend translating the text in multiple requests.

In the code below, I use the googletrans module: after fetching the text from the website, I translate it into Hindi. As an alternative, you can try the code below; I hope it is helpful to you:

import requests
from bs4 import BeautifulSoup
from googletrans import Translator
# Module-level googletrans client; language_translator() calls its translate().
translator = Translator()
def scrape_web_page(url, timeout=30):
    """Download *url* and return the text content of the page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the parsed HTML, as produced by
        ``BeautifulSoup.get_text()``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    # Without a timeout a stalled server would block the caller indefinitely.
    response = requests.get(url, timeout=timeout)
    # Do not hand an error page to the translator as if it were content.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.get_text()
def _chunk_words(words, max_chars):
    """Group *words* into space-joined chunks of at most *max_chars* characters.

    A single word longer than *max_chars* becomes a chunk of its own.
    """
    chunks = []
    current = []
    current_len = 0
    for word in words:
        # One extra character for the joining space, except for the first word.
        added = len(word) + (1 if current else 0)
        if current and current_len + added > max_chars:
            chunks.append(" ".join(current))
            current = [word]
            current_len = len(word)
        else:
            current.append(word)
            current_len += added
    if current:
        chunks.append(" ".join(current))
    return chunks


def language_translator(urls, max_chars=4000):
    """Scrape each page in *urls*, translate its text to Hindi and print it.

    The parameter is the list of pages itself; the function no longer
    ignores its argument in favour of a global named ``urls``.

    The page text is translated in chunks of whole words rather than one
    word per request: this needs far fewer network round trips and lets the
    translator see each word in context.

    Args:
        urls: Iterable of page addresses, processed in order.
        max_chars: Upper bound on the size of each chunk sent to the
            translator. The default is meant to stay well below the
            per-request text limit described in the googletrans docs
            (TODO confirm for the installed version).

    Returns:
        None. The translated text of every page is printed.
    """
    for count, url in enumerate(urls, start=1):
        print(f'.........................from page {count} ...........................................')
        text = scrape_web_page(url)
        # Collect the pieces and join once instead of growing a string in a loop.
        parts = [f"From page {count}"]
        for chunk in _chunk_words(text.split(), max_chars):
            translated_text = translator.translate(chunk, dest='hi')
            parts.append(str(translated_text.text))
        print(" ".join(parts))
# Pages to translate, listed in the order they should be processed.
urls = [
    'https://demo1/page1',
    'https://demo1/page2',
]
# Kick off scraping and translation for every listed page.
language_translator(urls)

NB: There are some copyright issues involved in website scraping.

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

如何使用Python爬取多个网页并将它们从英语翻译成印地语？

问题

答案1

如何从相关的Django模型中获取字段ID

在Python中执行操作以创建多个子列表

如何在Python中使用while循环迭代的所有值创建一个Excel文件？

Is there a way to specifically web scrape and get the data of heights that is not listed in text?

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。