如何找到页面上的所有字符串?

huangapple go评论58阅读模式
英文:

How to find all strings on a page?

问题

I almost managed to do what I want, but there is a small flaw.

I have this HTML

<body>
  <div>
    <div>div</div>
  </div>

  <h1>
    <h2>
      <p>p1</p>

      <p>
        <p>p2</p>
      </p>
    </h2>

    <h3>
      <h2>h2</h2>
      <h2>h2</h2>
    </h3>
  </h1>

  <span>span</span>
  <h6>
    <h6>h6</h6>
  </h6>
</body>

And my last attempt gives me almost the array I want

// Copy every element under <body> into an array ("*" matches all tag names).
var elements = Array.from(document.body.getElementsByTagName("*"));
var newStrings = [];

// Record the innerText of each element that has any text.
// Wrapper elements are visited as well as the elements nested inside them,
// so the same text is pushed once per enclosing element -- which is where the
// repeated 'div' and the merged 'p1\n\np2' entries in the output below come from.
for (var i = 0; i < elements.length; i++) {
  const el = elements[i];
  if (el.innerText.length !== 0) {
    newStrings.push(el.innerText);
  }
}

console.log(newStrings); //  ['div', 'div', 'p1\n\np2', 'p1', 'p2', 'h2', 'h2', 'span', 'h6']

but as a result I need

['div', 'p1', 'p2', 'h2', 'h2', 'span', 'h6']
英文:

I almost managed to do what I want, but there is a small flaw.

I have this HTML

<body>
  <div>
    <div>div</div>
  </div>

  <h1>
    <h2>
      <p>p1</p>

      <p>
        <p>p2</p>
      </p>
    </h2>

    <h3>
      <h2>h2</h2>
      <h2>h2</h2>
    </h3>
  </h1>

  <span>span</span>
  <h6>
    <h6>h6</h6>
  </h6>
</body>

And my last attempt gives me almost the array I want

// Copy every element under <body> into an array ("*" matches all tag names).
var elements = Array.from(document.body.getElementsByTagName("*"));
var newStrings = [];

// Record the innerText of each element that has any text.
// Wrapper elements are visited as well as the elements nested inside them,
// so the same text is pushed once per enclosing element -- which is where the
// repeated 'div' and the merged 'p1\n\np2' entries in the output below come from.
for (var i = 0; i < elements.length; i++) {
  const el = elements[i];
  if (el.innerText.length !== 0) {
    newStrings.push(el.innerText);
  }
}

console.log(newStrings); //  ['div', 'div', 'p1\n\np2', 'p1', 'p2', 'h2', 'h2', 'span', 'h6']

but as a result I need
['div', 'p1', 'p2', 'h2', 'h2', 'span', 'h6']

I will be very grateful for your help!

答案1

得分: 2

获取页面上所有字符串的最佳方法是选择页面中的所有文本节点,然后获取每个文本节点的文本内容(这样,您可以避免在选择父元素和子元素的innerText时出现重复字符串的情况)。

以下是一种选择页面中所有文本节点的方法(改编自https://stackoverflow.com/a/10730777/19461620):

// Walk only the text nodes under <body>; element nodes are never visited,
// so no string can be reported twice through a parent and its child.
const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, null, false);
const textNodes = [];
let n = walker.nextNode();
while (n !== null) {
  textNodes.push(n);
  n = walker.nextNode();
}

// Keep the raw text of every node that is not pure whitespace
// (indentation and line breaks between tags are text nodes too).
const newStrings = [];
for (const { textContent } of textNodes) {
  if (textContent.trim() !== '') {
    newStrings.push(textContent);
  }
}
console.log(newStrings); // outputs: ['div', 'p1', 'p2', 'h2', 'h2', 'span', 'h6']
英文:

The best way to get all the strings on the page is to select all text nodes in the page and then get the text content of each (this way, you avoid getting duplicate strings in cases where you select the innerText of both the parent and child).

Here is one way to select all the text nodes in a page (adapted from https://stackoverflow.com/a/10730777/19461620):

// Walk only the text nodes under <body>; element nodes are never visited,
// so no string can be reported twice through a parent and its child.
const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, null, false);
const textNodes = [];
let n = walker.nextNode();
while (n !== null) {
  textNodes.push(n);
  n = walker.nextNode();
}

// Keep the raw text of every node that is not pure whitespace
// (indentation and line breaks between tags are text nodes too).
const newStrings = [];
for (const { textContent } of textNodes) {
  if (textContent.trim() !== '') {
    newStrings.push(textContent);
  }
}
console.log(newStrings); // outputs: ['div', 'p1', 'p2', 'h2', 'h2', 'span', 'h6']

答案2

得分: 1

请尝试这样做,您将获得所需的输出:

/**
 * Collect the text of the page as an array of non-empty lines.
 *
 * The first element in document order whose innerText is not blank is used
 * (typically the outermost element, whose innerText already includes the text
 * of everything nested inside it). That text is split on line breaks and the
 * empty lines are dropped.
 *
 * @returns {string[]} The non-empty lines, or an empty array when no element
 *   on the page has any text.
 */
function getInnerText() {
  for (const element of document.querySelectorAll("*")) {
    const text = element.innerText;

    // Only the first non-blank innerText is ever used, so return as soon as
    // it is found instead of reading innerText from every remaining element.
    if (text && text.trim().length > 0) {
      return text.split("\n").filter((line) => line !== "");
    }
  }

  // No element has text: return an empty list rather than calling split()
  // on a missing first entry, which would throw a TypeError.
  return [];
}

const innerTexts = getInnerText();

console.log(innerTexts);
英文:

try this, you will get the desired output:

/**
 * Collect the text of the page as an array of non-empty lines.
 *
 * The first element in document order whose innerText is not blank is used
 * (typically the outermost element, whose innerText already includes the text
 * of everything nested inside it). That text is split on line breaks and the
 * empty lines are dropped.
 *
 * @returns {string[]} The non-empty lines, or an empty array when no element
 *   on the page has any text.
 */
function getInnerText() {
  for (const element of document.querySelectorAll("*")) {
    const text = element.innerText;

    // Only the first non-blank innerText is ever used, so return as soon as
    // it is found instead of reading innerText from every remaining element.
    if (text && text.trim().length > 0) {
      return text.split("\n").filter((line) => line !== "");
    }
  }

  // No element has text: return an empty list rather than calling split()
  // on a missing first entry, which would throw a TypeError.
  return [];
}

const innerTexts = getInnerText();

console.log(innerTexts);

答案3

得分: 1

首先,HTML 无效。<h1> 至 <h6> 以及 <p> 只能包含短语内容,而短语内容不包括 <h1> 至 <h6> 或 <p>。在下面的示例中,HTML 已经被更正。

详细信息在示例中已注释

/**
 * Extract the text of every text node under a DOM element using a NodeIterator.
 * @param {string|Element} [tag=document.body] - Either a CSS selector or the
 *        element itself to extract text from. When omitted, or when the value
 *        does not resolve to a DOM node (null, a selector that matches
 *        nothing, a non-node value), document.body is used instead.
 * @returns {string[]} The trimmed text of each text node, in the order the
 *          iterator yields them, with whitespace-only nodes left out.
 */
function getText(tag = document.body) {
  // A string is treated as a selector and resolved to its first match;
  // anything else is taken to be the node itself.
  const requested = typeof tag === "string" ? document.querySelector(tag) : tag;
  // querySelector() yields null when nothing matches, and createNodeIterator()
  // throws on a root that is not a node, so fall back to the documented default.
  const isNode = Boolean(requested) && typeof requested.nodeType === "number";
  const root = isNode ? requested : document.body;

  /*
   * Create the NodeIterator:
   *   root                 - node to start extracting text from,
   *   NodeFilter.SHOW_TEXT - built-in filter that lets only text nodes through,
   *   callback             - custom filter applied to each of those text nodes.
   */
  const itr = document.createNodeIterator(
    root,
    NodeFilter.SHOW_TEXT,
    (node) => {
      // Leave out the contents of <script> and <style>. parentElement is null
      // when the text node's parent is not an element, so guard before
      // reading tagName.
      const parentTag = node.parentElement ? node.parentElement.tagName : null;
      if (parentTag === "SCRIPT" || parentTag === "STYLE") {
        return NodeFilter.FILTER_SKIP;
      }
      return NodeFilter.FILTER_ACCEPT;
    }
  );

  // Trim each text node and keep only those with something left.
  const result = [];
  let current = itr.nextNode();
  while (current !== null) {
    const text = current.nodeValue.trim();
    if (text !== "") {
      result.push(text);
    }
    current = itr.nextNode();
  }
  return result;
}
console.log(JSON.stringify(getText()));
<!-- 注释显示至少包含一个单词字符的每个文本节点的未过滤结果 -->
<div>
  <div>div</div> <!-- "div" &#128077;-->
</div>

<header>
  <h1>
    <i>h1 </i> <!-- "h1 " 文本后有空格 -->
  </h1>
  <h2>
    <u>h2​</u> <!-- "h2" 零宽度空格可能会妨碍匹配 -->
  </h2>
  <h3>
    <i>h3
    <!-- "h3\n    " 文本后有换行符和制表符 --> 
    </i>
    <q> h3 </q> <!-- " h3 " 文本前后有空格 -->
  </h3>
</header>

<span>  span</span> <!-- "  span" 文本前有制表符 -->
<h6>
  <u>
  h6</u> <!-- "\n  h6" 文本前有换行符和制表符 -->
</h6>
英文:

First off, the HTML is invalid. <h1> to <h6> and <p> may only contain phrasing content, which does NOT include <h1> to <h6> or <p>. In the example below, the HTML has been corrected.

Details are commented in example

<!-- begin snippet: js hide: false console: true babel: false -->

<!-- language: lang-js -->

/**
 * Extract the text of every text node under a DOM element using a NodeIterator.
 * @param {string|Element} [tag=document.body] - Either a CSS selector or the
 *        element itself to extract text from. When omitted, or when the value
 *        does not resolve to a DOM node (null, a selector that matches
 *        nothing, a non-node value), document.body is used instead.
 * @returns {string[]} The trimmed text of each text node, in the order the
 *          iterator yields them, with whitespace-only nodes left out.
 */
function getText(tag = document.body) {
  // A string is treated as a selector and resolved to its first match;
  // anything else is taken to be the node itself.
  const requested = typeof tag === "string" ? document.querySelector(tag) : tag;
  // querySelector() yields null when nothing matches, and createNodeIterator()
  // throws on a root that is not a node, so fall back to the documented default.
  const isNode = Boolean(requested) && typeof requested.nodeType === "number";
  const root = isNode ? requested : document.body;

  /*
   * Create the NodeIterator:
   *   root                 - node to start extracting text from,
   *   NodeFilter.SHOW_TEXT - built-in filter that lets only text nodes through,
   *   callback             - custom filter applied to each of those text nodes.
   */
  const itr = document.createNodeIterator(
    root,
    NodeFilter.SHOW_TEXT,
    (node) => {
      // Leave out the contents of <script> and <style>. parentElement is null
      // when the text node's parent is not an element, so guard before
      // reading tagName.
      const parentTag = node.parentElement ? node.parentElement.tagName : null;
      if (parentTag === "SCRIPT" || parentTag === "STYLE") {
        return NodeFilter.FILTER_SKIP;
      }
      return NodeFilter.FILTER_ACCEPT;
    }
  );

  // Trim each text node and keep only those with something left.
  const result = [];
  let current = itr.nextNode();
  while (current !== null) {
    const text = current.nodeValue.trim();
    if (text !== "") {
      result.push(text);
    }
    current = itr.nextNode();
  }
  return result;
}
console.log(JSON.stringify(getText()));

<!-- language: lang-html -->

&lt;!-- Comments show unfiltered results for each textNode with at least one
word character --&gt;
&lt;div&gt;
  &lt;div&gt;div&lt;/div&gt; &lt;!-- &quot;div&quot; &#128077;--&gt;
&lt;/div&gt;

&lt;header&gt;
  &lt;h1&gt;
    &lt;i&gt;h1 &lt;/i&gt; &lt;!-- &quot;h1 &quot; space after text--&gt;
  &lt;/h1&gt;
  &lt;h2&gt;
    &lt;u&gt;h2​&lt;/u&gt; &lt;!-- &quot;h2&quot; zero width spaces can hinder matching --&gt;
  &lt;/h2&gt;
  &lt;h3&gt;
    &lt;i&gt;h3
    &lt;!-- &quot;h3\n    &quot; new line and tab after text--&gt; 
    &lt;/i&gt;
    &lt;q&gt; h3 &lt;/q&gt; &lt;!-- &quot; h3 &quot; space before and after text --&gt;
  &lt;/h3&gt;
&lt;/header&gt;

&lt;span&gt;  span&lt;/span&gt; &lt;!-- &quot;  span&quot; tab before text --&gt;
&lt;h6&gt;
  &lt;u&gt;
  h6&lt;/u&gt; &lt;!-- &quot;\n  h6&quot; new line and tab before text --&gt;
&lt;/h6&gt;

<!-- end snippet -->

huangapple
  • 本文由 发表于 2023年1月9日 06:12:25
  • 转载请务必保留本文链接:https://go.coder-hub.com/75051630.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定