英文:
Python playwright unable to access elements
问题
我想要抓取位于<li>
元素中的单词。结果返回了一个空列表。它们是否位于一个框架内?因为我看到它们不在任何<iframe></iframe>
元素中。如果它们确实在框架内,那么在这种情况下,如何访问框架或找到框架的ID?这是网站和代码
from playwright.sync_api import sync_playwright, expect
def test_fetch_paperrater():
    """Open the adjectives page in Chromium and print the inner texts of the matched <li> elements."""
    target_url = r"https://www.paperrater.com/page/lists-of-adjectives"
    # The locator is scoped to div#header-container, exactly as posted in the question.
    item_selector = "div#header-container article.page ul li"
    with sync_playwright() as pw:
        chromium = pw.chromium.launch()
        tab = chromium.new_page()
        tab.goto(target_url)
        item_texts = tab.locator(item_selector).all_inner_texts()
        print(item_texts)
        chromium.close()
英文:
I want to scrape the words which reside in the <li>
elements. The result is an empty list. Are they inside a frame? As far as I can see, they are not within any <iframe></iframe>
elements. If they are, how do I access the frame or find the frame id in this case? Here is the site and the code
from playwright.sync_api import sync_playwright, expect
def test_fetch_paperrater():
    """Open the adjectives page in Chromium and print the inner texts of the matched <li> elements."""
    target_url = r"https://www.paperrater.com/page/lists-of-adjectives"
    # The locator is scoped to div#header-container, exactly as posted in the question.
    item_selector = "div#header-container article.page ul li"
    with sync_playwright() as pw:
        chromium = pw.chromium.launch()
        tab = chromium.new_page()
        tab.goto(target_url)
        item_texts = tab.locator(item_selector).all_inner_texts()
        print(item_texts)
        chromium.close()
答案1
得分: 2
元素不在div#header-container
中,而在div#wrapper
中。有多个ul
元素,访问这些元素的最佳方法是使用nth()
,如下所示
# Collect the words of every second <ul> under div#wrapper into one flat list.
# `path` is the page URL defined in the question's code.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto(path)
    # Build the locator once; only the nth() index changes per iteration.
    word_lists = page.locator("div#wrapper article.page ul")
    words = []
    # Visit the <ul> elements at indexes 1, 3, ..., 21.
    for i in range(1, 22, 2):
        all_texts = word_lists.nth(i).all_inner_texts()
        # The first returned string is split on newlines to get the words.
        # extend() replaces the old `append = words.append(text)` loop, whose
        # assignment only ever bound None.
        words.extend(all_texts[0].split("\n"))
    browser.close()
英文:
The elements were not in div#header-container
but div#wrapper
. There were multiple ul
elements and the best way to access these was with nth()
as follows
# Collect the words of every second <ul> under div#wrapper into one flat list.
# `path` is the page URL defined in the question's code.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto(path)
    # Build the locator once; only the nth() index changes per iteration.
    word_lists = page.locator("div#wrapper article.page ul")
    words = []
    # Visit the <ul> elements at indexes 1, 3, ..., 21.
    for i in range(1, 22, 2):
        all_texts = word_lists.nth(i).all_inner_texts()
        # The first returned string is split on newlines to get the words.
        # extend() replaces the old `append = words.append(text)` loop, whose
        # assignment only ever bound None.
        words.extend(all_texts[0].split("\n"))
    browser.close()
答案2
得分: 0
页面上没有iframes:在浏览器控制台中运行document.querySelector("iframe")
会返回null。但这通常是调试定位失败时的一个好思路。
页面上有许多不同的列表,但没有一个在页眉中。article.page
也不在页眉中。我不确定你想要哪个列表,但以下代码将获取它们以及它们的标题:
from playwright.sync_api import sync_playwright # 1.23.1
url = "<Your URL>"

# Print every section heading on the page together with the <li> texts of the
# element that directly follows it.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    context = browser.new_context(java_script_enabled=False)
    # The page must come from `context`; browser.new_page() would create its
    # own context and the java_script_enabled=False setting would not apply.
    page = context.new_page()

    def handle_route(route):
        """Let the request for `url` through and abort every other request."""
        if route.request.url == url:
            route.continue_()
        else:
            route.abort()

    page.route("**/*", handle_route)
    page.goto(url, wait_until="domcontentloaded")
    # Text is trimmed so the output matches the BeautifulSoup variant, which
    # strips both headers and items.
    lists = (
        page.locator(".content h2.doc-h2")
        .evaluate_all("""
            els =>
              els.map(e => ({
                header: e.textContent.trim(),
                items: [...e.nextElementSibling.querySelectorAll("li")]
                  .map(e => e.textContent.trim())
              }))
        """)
    )
    for lst in lists:
        print(lst)
    browser.close()
请注意,数据在HTML中静态可用,因此我已阻止了图像并禁用了JS。
一旦达到这一点,您甚至可能不需要使用Playwright。我们可以通过使用requests和BeautifulSoup来加速操作:
import requests
from bs4 import BeautifulSoup
url = "<Your URL>"
# Without a timeout, requests.get() can block indefinitely on a stalled server.
res = requests.get(url, timeout=10)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")
# One entry per section heading: the stripped heading text plus the stripped
# <li> texts of the first <ul> found after that heading.
lists = [
    {
        "header": heading.text.strip(),
        "items": [li.text.strip() for li in heading.find_next("ul").select("li")],
    }
    for heading in soup.select(".content h2.doc-h2")
]
for lst in lists:
    print(lst)
另一方面,如果您只想要一个列表:
(使用Playwright)
# ...
page.goto(url, wait_until="domcontentloaded")
# Pick the heading by its text, then read the <li> items of the element right
# after it. Items are trimmed to match the BeautifulSoup variant, which strips them.
lst = (
    page.locator(".content h2.doc-h2", has_text="Appearance adjectives list")
    .evaluate("""
        el => [...el.nextElementSibling.querySelectorAll("li")]
          .map(e => e.textContent.trim())
    """)
)
print(lst)
(使用BeautifulSoup)
import re
import requests
from bs4 import BeautifulSoup
url = "<Your URL>"
# Without a timeout, requests.get() can block indefinitely on a stalled server.
res = requests.get(url, timeout=10)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")
# `string=` is the current name of this filter; `text=` is its deprecated alias.
h = soup.find(string=re.compile(r"^\s*Appearance adjectives list\s*$"))
if h is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise LookupError('no "Appearance adjectives list" heading found in the page')
# Stripped <li> texts of the first <ul> found after the matched heading text.
lst = [li.text.strip() for li in h.find_next("ul").select("li")]
print(lst)
如果您只想要形容词的列表,对于Playwright或BeautifulSoup来说,最简单的更改是使用切片每隔一个取一个列表(从第一个开始,即偶数索引),并去掉尾部的“趋势”列表:
# Every other entry, starting with the first and leaving out the last ten.
every_other = lists[:-10:2]
for lst in every_other:
    print(lst)
如果这看起来对特定页面太硬编码了,您可以通过过滤以“list”结尾的标题来筛选:
(使用Playwright)
# Keep only headings whose trimmed text ends with " list". The stored header
# and items are trimmed too, so they match the text the filter tested.
lists = (
    page.locator(".content h2.doc-h2")
    .evaluate_all("""
        els => els
          .filter(e => e.textContent.trim().endsWith(" list"))
          .map(e => ({
            header: e.textContent.trim(),
            items: [...e.nextElementSibling.querySelectorAll("li")]
              .map(e => e.textContent.trim())
          }))
    """)
)
(使用BeautifulSoup)
# Append only the sections whose stripped heading text ends with " list".
for heading in soup.select(".content h2.doc-h2"):
    if not heading.text.strip().endswith(" list"):
        continue
    title = heading.text.strip()
    entries = [li.text.strip() for li in heading.find_next("ul").select("li")]
    lists.append({"header": title, "items": entries})
或者筛选非空列表,以及所有项目都正好是一个词:
(使用Playwright)
# Build {header, items} for every heading, then keep the entries that have at
# least one item and whose items each split on whitespace into a single piece.
# The script is a raw string so the regex `\s` is not read as a Python escape.
lists = (
    page.locator(".content h2.doc-h2")
    .evaluate_all(r"""
        els => els
          .map(e => ({
            header: e.textContent.trim(),
            items: [...e.nextElementSibling.querySelectorAll("li")]
              .map(e => e.textContent.trim())
          }))
          .filter(e =>
            e.items.length &&
            e.items.every(e => e.split(/\s+/).length === 1)
          )
    """)
)
(使用BeautifulSoup)
# Append a section only when its list is non-empty and every item is one word.
for heading in soup.select(".content h2.doc-h2"):
    entries = [li.text.strip() for li in heading.find_next("ul").select("li")]
    if not entries:
        continue
    if any(len(entry.split()) != 1 for entry in entries):
        continue
    lists.append({"header": heading.text.strip(), "items": entries})
英文:
There are no iframes on the page: document.querySelector("iframe")
returns null when run in the browser console. But that's often a good line of thought for debugging failing locators.
There are many different lists on the page, but none of them are inside the header. article.page
is also not in the header. I'm not sure which list you want, but this gets all of them along with their accompanying header:
from playwright.sync_api import sync_playwright # 1.23.1
url = "<Your URL>"

# Print every section heading on the page together with the <li> texts of the
# element that directly follows it.
with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    context = browser.new_context(java_script_enabled=False)
    # The page must come from `context`; browser.new_page() would create its
    # own context and the java_script_enabled=False setting would not apply.
    page = context.new_page()

    def handle_route(route):
        """Let the request for `url` through and abort every other request."""
        if route.request.url == url:
            route.continue_()
        else:
            route.abort()

    page.route("**/*", handle_route)
    page.goto(url, wait_until="domcontentloaded")
    # Text is trimmed so the output matches the BeautifulSoup variant, which
    # strips both headers and items.
    lists = (
        page.locator(".content h2.doc-h2")
        .evaluate_all("""
            els =>
              els.map(e => ({
                header: e.textContent.trim(),
                items: [...e.nextElementSibling.querySelectorAll("li")]
                  .map(e => e.textContent.trim())
              }))
        """)
    )
    for lst in lists:
        print(lst)
    browser.close()
Note that the data is available statically in the HTML, so I've blocked images and disabled JS.
Once you get to that point, you may not even need Playwright. We could speed things up by using requests and BeautifulSoup:
import requests
from bs4 import BeautifulSoup
url = "<Your URL>"
# Without a timeout, requests.get() can block indefinitely on a stalled server.
res = requests.get(url, timeout=10)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")
# One entry per section heading: the stripped heading text plus the stripped
# <li> texts of the first <ul> found after that heading.
lists = [
    {
        "header": heading.text.strip(),
        "items": [li.text.strip() for li in heading.find_next("ul").select("li")],
    }
    for heading in soup.select(".content h2.doc-h2")
]
for lst in lists:
    print(lst)
On the other hand, if you just want one list:
(Playwright)
# ...
page.goto(url, wait_until="domcontentloaded")
# Pick the heading by its text, then read the <li> items of the element right
# after it. Items are trimmed to match the BeautifulSoup variant, which strips them.
lst = (
    page.locator(".content h2.doc-h2", has_text="Appearance adjectives list")
    .evaluate("""
        el => [...el.nextElementSibling.querySelectorAll("li")]
          .map(e => e.textContent.trim())
    """)
)
print(lst)
(BeautifulSoup)
import re
import requests
from bs4 import BeautifulSoup
url = "<Your URL>"
# Without a timeout, requests.get() can block indefinitely on a stalled server.
res = requests.get(url, timeout=10)
res.raise_for_status()
soup = BeautifulSoup(res.text, "lxml")
# `string=` is the current name of this filter; `text=` is its deprecated alias.
h = soup.find(string=re.compile(r"^\s*Appearance adjectives list\s*$"))
if h is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise LookupError('no "Appearance adjectives list" heading found in the page')
# Stripped <li> texts of the first <ul> found after the matched heading text.
lst = [li.text.strip() for li in h.find_next("ul").select("li")]
print(lst)
If you want just the lists of adjectives, the shortest change for either Playwright or BeautifulSoup is to use a slice that takes every other list, starting with the first (the even indexes), and trims the "trends" lists at the tail:
# Every other entry, starting with the first and leaving out the last ten.
every_other = lists[:-10:2]
for lst in every_other:
    print(lst)
If this seems too hardcoded to the particular page, you could filter by headers that end with " list":
(Playwright)
# Keep only headings whose trimmed text ends with " list". The stored header
# and items are trimmed too, so they match the text the filter tested.
lists = (
    page.locator(".content h2.doc-h2")
    .evaluate_all("""
        els => els
          .filter(e => e.textContent.trim().endsWith(" list"))
          .map(e => ({
            header: e.textContent.trim(),
            items: [...e.nextElementSibling.querySelectorAll("li")]
              .map(e => e.textContent.trim())
          }))
    """)
)
(BeautifulSoup)
# Append only the sections whose stripped heading text ends with " list".
for heading in soup.select(".content h2.doc-h2"):
    if not heading.text.strip().endswith(" list"):
        continue
    title = heading.text.strip()
    entries = [li.text.strip() for li in heading.find_next("ul").select("li")]
    lists.append({"header": title, "items": entries})
Or filter for lists that are nonempty, and where all items are exactly one word:
(Playwright)
# Build {header, items} for every heading, then keep the entries that have at
# least one item and whose items each split on whitespace into a single piece.
# The script is a raw string so the regex `\s` is not read as a Python escape.
lists = (
    page.locator(".content h2.doc-h2")
    .evaluate_all(r"""
        els => els
          .map(e => ({
            header: e.textContent.trim(),
            items: [...e.nextElementSibling.querySelectorAll("li")]
              .map(e => e.textContent.trim())
          }))
          .filter(e =>
            e.items.length &&
            e.items.every(e => e.split(/\s+/).length === 1)
          )
    """)
)
(BeautifulSoup)
# Append a section only when its list is non-empty and every item is one word.
for heading in soup.select(".content h2.doc-h2"):
    entries = [li.text.strip() for li in heading.find_next("ul").select("li")]
    if not entries:
        continue
    if any(len(entry.split()) != 1 for entry in entries):
        continue
    lists.append({"header": heading.text.strip(), "items": entries})
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论