2023年2月24日 05:33:30go评论71阅读模式

英文:

Get src from img tag with puppeteer

问题

return {
    src: e.querySelector('.fc-item__media-wrapper .responsive-img').getAttribute('src'),
    image: text('.fc-item__media-wrapper .responsive-img'),
};

英文:

I want to get the link text from the src attribute within an img tag. This is part of the html with the img and src tags:

<img alt="" class="responsive-img" src="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=300&amp;quality=85&amp;auto=format&amp;fit=max&amp;s=787aa5ddd44a8a66a06120452e228503">

I will give the HTML and the script I use because that is what I tried:

This is the HTML:

<div class="fc-item__container">
    <div class="fc-item__media-wrapper">
        <div class="fc-item__image-container u-responsive-ratio">
            <picture><!--[if IE 9]><video style="display: none;"><![endif]-->
                <source
                    media="(min-width: 980px) and (-webkit-min-device-pixel-ratio: 1.25), (min-width: 980px) and (min-resolution: 120dpi)"
                    srcset="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=140&amp;quality=45&amp;auto=format&amp;fit=max&amp;dpr=2&amp;s=35c7a9a7cc4e5ebd8fcdfcb67177a8f4 280w">
                <source media="(min-width: 980px)"
                    srcset="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=140&amp;quality=85&amp;auto=format&amp;fit=max&amp;s=f68f029ce1b60ed96581f28a29062e3b 140w">
                <source
                    media="(min-width: 740px) and (-webkit-min-device-pixel-ratio: 1.25), (min-width: 740px) and (min-resolution: 120dpi)"
                    srcset="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=140&amp;quality=45&amp;auto=format&amp;fit=max&amp;dpr=2&amp;s=35c7a9a7cc4e5ebd8fcdfcb67177a8f4 280w">
                <source media="(min-width: 740px)"
                    srcset="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=140&amp;quality=85&amp;auto=format&amp;fit=max&amp;s=f68f029ce1b60ed96581f28a29062e3b 140w">
                <!--[if IE 9]></video><![endif]-->
                <img alt="" class="responsive-img"
                    src="https://i.guim.co.uk/img/media/6167380a1330877b8265353f2756b127c2226824/0_81_4256_2554/master/4256.jpg?width=300&amp;quality=85&amp;auto=format&amp;fit=max&amp;s=787aa5ddd44a8a66a06120452e228503">
            </picture>
        </div>
    </div>
</div>

This is the Puppeteer script:

// Question script: loads the page with Puppeteer, reads every
// ".fc-item__container" card and writes the collected data to img_src.json.
const fs = require("node:fs/promises");
const puppeteer = require("puppeteer"); // ^19.4.1

const url = "https://www.theguardian.com/international";

let browser;
(async () => {
    browser = await puppeteer.launch();
    const [page] = await browser.pages();
    // JavaScript is disabled and every request except the one for the page
    // itself is aborted, so only the initial HTML document is loaded.
    await page.setJavaScriptEnabled(false);
    await page.setRequestInterception(true);
    page.on("request", req => {
        if (req.url() !== url) {
            req.abort();
        }
        else {
            req.continue();
        }
    });
    await page.goto(url, { waitUntil: "domcontentloaded" });
    // The callback below runs in the page context, once for the whole list
    // of matched containers.
    const img_src = await page.$$eval(".fc-item__container", els =>
        els.map(e => {
            // Trimmed text content of the first match, or undefined if none.
            const text = s => e.querySelector(s)?.textContent.trim();
            return {
                // NOTE: this is the line the question is about. The trailing
                // " src" is a descendant selector for a <src> element, not an
                // attribute lookup, so nothing matches and the value is null.
                src: e.querySelector(".fc-item__media-wrapper .responsive-img src"),
                // The matched <img> has no text, so this is an empty string.
                image: text(".fc-item__media-wrapper .responsive-img"),
            };
        })
    );
    console.log(img_src);
    await fs.writeFile("img_src.json", JSON.stringify(img_src, null, 2));
})()
    .catch(err => console.error(err))
    // Close the browser whether or not the scrape succeeded.
    .finally(() => browser?.close());

The script runs but all I get are empty strings, like this:

[
  {
    "src": null,
    "image": ""
  },
  {
    "src": null,
    "image": ""
  }
]

As you can see, I tried two variations, but neither gives any result.

return {
    src: e.querySelector(".fc-item__media-wrapper .responsive-img src"),
    image: text(".fc-item__media-wrapper .responsive-img"),
};

Any help is much appreciated.

答案1

得分: 1

首先，关于阻止请求和禁用 JS，你做得很棒！这显著加速了脚本的执行，意味着我们可以纯粹依赖于 view-source:，从而简化了事情。

问题在于：

e.querySelector(".fc-item__media-wrapper .responsive-img src"),

这句话的意思是“返回具有 class="responsive-img" 的元素内的 <src> 标记，该元素位于具有 class="fc-item__media-wrapper" 的元素内”。你可能是想要：

e.querySelector(".fc-item__media-wrapper .responsive-img")
  ?.getAttribute("src")

至于“text”，我不确定它指的是什么，因为在 .fc-item__media-wrapper 类内部似乎没有文本。

如果你正在寻找“kicker text” 或“headline”，这是一种方法：

const fs = require("node:fs/promises");
const puppeteer = require("puppeteer"); // ^19.6.3

const url = "<Your URL>";

// Executed in the page context by $$eval, so it must not reference anything
// from the Node.js scope. Maps each card container to a plain object.
const collectCards = containers =>
  containers.map(container => {
    const pick = selector => container.querySelector(selector);
    const trimmedText = selector => pick(selector)?.textContent.trim();
    const image = pick(".fc-item__media-wrapper .responsive-img");
    return {
      src: image?.getAttribute("src"),
      kicker: trimmedText(".fc-item__kicker"),
      headline: trimmedText(".fc-item__headline"),
    };
  });

let browser;

// Launches the browser, loads only the main document (JS off, all other
// requests aborted), then logs the scraped cards and saves them as JSON.
const run = async () => {
  browser = await puppeteer.launch();
  const pages = await browser.pages();
  const page = pages[0];
  await page.setJavaScriptEnabled(false);
  await page.setRequestInterception(true);
  page.on("request", request => {
    if (request.url() === url) {
      request.continue();
      return;
    }
    request.abort();
  });
  await page.goto(url, {waitUntil: "domcontentloaded"});
  const data = await page.$$eval(".fc-item__container", collectCards);
  console.log(data);
  await fs.writeFile("img_src.json", JSON.stringify(data, null, 2));
};

run()
  .catch(err => console.error(err))
  // Always shut the browser down, even after a failure.
  .finally(() => browser?.close());

顺便提一下，一旦你已经阻止了所有请求并禁用了 JS，通常可以使用 fetch（或者如果你尚未使用 Node 18，则使用 axios）与 Cheerio。这将简化事情并进一步提速：

// Fetches the page's HTML directly and extracts the image URL, kicker and
// headline of every ".fc-item__container" card with Cheerio.
const cheerio = require("cheerio"); // 1.0.0-rc.12

const url = "<Your URL>";

fetch(url)
  .then(res => {
    if (!res.ok) {
      // statusText can be an empty string (e.g. over HTTP/2), so include the
      // numeric status to keep the error message meaningful.
      throw new Error(
        `Request failed with status ${res.status} ${res.statusText}`.trim()
      );
    }

    return res.text();
  })
  .then(html => {
    const $ = cheerio.load(html);
    // Spread the Cheerio selection into a real array so Array#map receives
    // each raw element, which is then re-wrapped with $(e).
    const data = [...$(".fc-item__container")].map(e => ({
      src: $(e).find(".fc-item__media-wrapper .responsive-img").attr("src"),
      kicker: $(e).find(".fc-item__kicker").text().trim(),
      headline: $(e).find(".fc-item__headline").text().trim(),
    }));
    console.log(data);
  })
  .catch(err => console.error(err));

另请参阅提问者的相关问题。

英文:

First of all, great job on blocking requests and disabling JS! This speeds up the script considerably and means we can rely purely on the view-source: which simplifies matters.

A problem is:

e.querySelector(".fc-item__media-wrapper .responsive-img src"),

This says "return the <src> tag within an element with class="responsive-img" within an element with class="fc-item__media-wrapper"". You probably mean:

e.querySelector(".fc-item__media-wrapper .responsive-img")
  ?.getAttribute("src")

As for the "text", I'm not sure what that refers to, since there's no text anywhere inside of the .fc-item__media-wrapper class.

If you're looking for the kicker text or headline, here's one approach:

// Loads the page with Puppeteer, collects the image URL, kicker and headline
// of every ".fc-item__container" card and saves the result to img_src.json.
const fs = require("node:fs/promises");
const puppeteer = require("puppeteer"); // ^19.6.3

const url = "<Your URL>";

let browser;
(async () => {
  browser = await puppeteer.launch();
  const [page] = await browser.pages();
  // JavaScript is disabled and every request except the one for the page
  // itself is aborted, so only the initial HTML document is loaded.
  await page.setJavaScriptEnabled(false);
  await page.setRequestInterception(true);
  page.on("request", req => {
    if (req.url() !== url) {
      req.abort();
    }
    else {
      req.continue();
    }
  });
  await page.goto(url, {waitUntil: "domcontentloaded"});
  // The callback runs in the page context over all matched containers.
  const data = await page.$$eval(".fc-item__container", els =>
    els.map(e => {
      // $: first descendant matching the selector (or null).
      const $ = s => e.querySelector(s);
      // text: trimmed text content of that match, or undefined if none.
      const text = s => $(s)?.textContent.trim();
      return {
        // Read the src attribute of the <img>; cards without an image
        // yield undefined thanks to the optional chaining.
        src: $(".fc-item__media-wrapper .responsive-img")
               ?.getAttribute("src"),
        kicker: text(".fc-item__kicker"),
        headline: text(".fc-item__headline"),
      };
    })
  );
  console.log(data);
  await fs.writeFile("img_src.json", JSON.stringify(data, null, 2));
})()
  .catch(err => console.error(err))
  // Close the browser whether or not the scrape succeeded.
  .finally(() => browser?.close());

By the way, once you've gotten to the point where you're blocking all requests and have disabled JS, you can often just use fetch (or axios, if you're not on Node 18 yet) with Cheerio. This simplifies matters and further speeds things up:

// Fetches the page's HTML directly and extracts the image URL, kicker and
// headline of every ".fc-item__container" card with Cheerio.
const cheerio = require("cheerio"); // 1.0.0-rc.12

const url = "<Your URL>";

fetch(url)
  .then(res => {
    if (!res.ok) {
      // statusText can be an empty string (e.g. over HTTP/2), so include the
      // numeric status to keep the error message meaningful.
      throw new Error(
        `Request failed with status ${res.status} ${res.statusText}`.trim()
      );
    }

    return res.text();
  })
  .then(html => {
    const $ = cheerio.load(html);
    // Spread the Cheerio selection into a real array so Array#map receives
    // each raw element, which is then re-wrapped with $(e).
    const data = [...$(".fc-item__container")].map(e => ({
      src: $(e).find(".fc-item__media-wrapper .responsive-img").attr("src"),
      kicker: $(e).find(".fc-item__kicker").text().trim(),
      headline: $(e).find(".fc-item__headline").text().trim(),
    }));
    console.log(data);
  })
  .catch(err => console.error(err));

从图像标签中使用 Puppeteer 获取 src

问题

答案1

字符串分割以捕获图像网址

Supabase Auth UI：显示注册界面而不是登录界面？

React列出的对象在状态更改时卸载。

如何在webpack 5或其他方式中配置runtimeChunk与预编译文件路径。

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

发表评论