Python – 使用BeautifulSoup从EML文件中提取URL

huangapple go评论68阅读模式
英文:

Python - How to Pull URLs From EML Files with BeautifulSoup

问题

Sure, here's the translated code portion:

我正在尝试读取一个EML文件然后提取其中的所有URL

我有两个方法:body_to_text() 从EML中获取正文(使用BytesParser或Soup);find_links() 接收正文并使用正则表达式查找URL。

对于大多数样本,我已经使它们正常工作了。但是,当使用Soup来解析非多部分(non-multipart)文件时,如果样本包含行尾等号,我就会遇到问题。

```python
def body_to_text():
        """Read "email.eml" and hand its body text to the link finder.

        Multipart messages are re-parsed with ``BytesParser`` and their
        ``plain`` body is passed to ``self.find_links``; every other message
        has its payload wrapped in BeautifulSoup and passed to ``find_links``.
        Prints "No body found" when the plain body is missing or empty.

        NOTE(review): the function takes no ``self`` parameter yet reads
        ``self.email`` and calls ``self.find_links`` -- presumably it was
        lifted out of a class; confirm against the real caller.
        """
        with open("email.eml", "rb") as email_file:
            email_message = email.message_from_binary_file(email_file)

        if email_message.is_multipart():
            # The same message is parsed a second time, here with the modern
            # policy that provides get_body()/get_content().
            with open(self.email, 'rb') as fp:
                msg = BytesParser(policy=policy.default).parse(fp)

            try:
                # ('plain') is a parenthesised string, not a one-element tuple.
                body_text = msg.get_body(preferencelist=('plain')).get_content().strip()
            except AttributeError:
                # Presumably reached when get_body() returns None (no matching
                # part), so the chained .get_content() fails -- TODO confirm.
                print("No body found")
            else:
                # Drop every newline so the text is scanned as a single line.
                body_text = body_text.replace("\n", "")
                
                if body_text == "":
                    print("No body found")
                else:
                    self.find_links(body_text)

        else:            
            # get_payload() is called without decode=True, so a
            # quoted-printable body still carries "=3D" escapes and "="
            # soft line breaks when it reaches BeautifulSoup -- this is the
            # behaviour the question is asking about.
            body_html = email_message.get_payload()
            soup = BeautifulSoup(body_html, "lxml")
            find_links(soup)

def find_links(scan_text):
        WEB_URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()&lt;&gt;{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|

<details>
<summary>英文:</summary>

I'm trying to read an EML file and then pull all URLs within it.

I've got two methods: body_to_text(), which gets the body from the EML with either BytesParser or Soup; and find_links(), which takes the body and uses a regex to find the URLs.

I've got it working for most samples I've tried. However, when using Soup to parse the non-multipart files, I run into a problem when the sample contains end-of-line equals signs.

def body_to_text():
    """Read "email.eml" and hand its body text to the link finder.

    Multipart messages are re-parsed with ``BytesParser`` and their
    ``plain`` body is passed to ``self.find_links``; every other message
    has its payload wrapped in BeautifulSoup and passed to ``find_links``.
    Prints "No body found" when the plain body is missing or empty.

    The statements are kept exactly as asked in the question; only the
    HTML-escaped quotes and the lost indentation have been restored.

    NOTE(review): the function takes no ``self`` parameter yet reads
    ``self.email`` and calls ``self.find_links`` -- presumably it was
    lifted out of a class; confirm against the real caller.
    """
    with open("email.eml", "rb") as email_file:
        email_message = email.message_from_binary_file(email_file)

    if email_message.is_multipart():
        # The same message is parsed a second time, here with the modern
        # policy that provides get_body()/get_content().
        with open(self.email, 'rb') as fp:
            msg = BytesParser(policy=policy.default).parse(fp)
        try:
            # ('plain') is a parenthesised string, not a one-element tuple.
            body_text = msg.get_body(preferencelist=('plain')).get_content().strip()
        except AttributeError:
            # Presumably reached when get_body() returns None (no matching
            # part), so the chained .get_content() fails -- TODO confirm.
            print("No body found")
        else:
            # Drop every newline so the text is scanned as a single line.
            body_text = body_text.replace("\n", "")
            if body_text == "":
                print("No body found")
            else:
                self.find_links(body_text)
    else:
        # get_payload() is called without decode=True, so a quoted-printable
        # body still carries "=3D" escapes and "=" soft line breaks when it
        # reaches BeautifulSoup -- this is the behaviour the question is
        # asking about.
        body_html = email_message.get_payload()
        soup = BeautifulSoup(body_html, "lxml")
        find_links(soup)

def find_links(scan_text):
    """Find URL-like substrings in *scan_text* and print them de-duplicated.

    *scan_text* is converted with ``str()`` before matching, so any object
    with a useful string form may be passed.  Prints the number of distinct
    matches followed by the list itself; returns nothing.
    """
    # NOTE(review): this pattern looks damaged by the page rendering -- the
    # backslash escapes and the TLD alternation present in the fenced copy of
    # the question's code are missing here.  It is reproduced unchanged;
    # restore it from the upstream source before relying on the matches.
    # With this many capturing groups re.findall() yields tuples, not strings.
    WEB_URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.-]+./)(?:[^\s()<>{}[]]+|([^\s()]?([^\s()]+)[^\s()]?)|([^\s]+?))+(?:([^\s()]?([^\s()]+)[^\s()]?)|([^\s]+?)|[^\s`!()[]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.-][a-z0-9]+)*.\b/?(?!@)))"""
    links = re.findall(WEB_URL_REGEX, str(scan_text))

    # De-duplicate while keeping first-seen order.  The original read
    # ``self.links`` here, which cannot work in a function without ``self``.
    links = list(dict.fromkeys(links))
    print(f"{len(links)} links found")
    print(links)

print(body_html) gives:
> ```
> <a href=3D"http://fawper.xyz/corruptly/676197486/trout/gen=
> eralizing/1683814388/upgather/disjoin" style=3D"-webkit-text-size-adjust:no=
> ne;text-decoration:none;"> <font style=3D"-webkit-text-size-adjust:none;fon=
> t-size:15px;
> ```
And print(soup) gives:
> ```
> href='3D"http://fawper.xyz/corruptly/676197486/trout/gen=' ne="" style='3D"-webkit-text-size-adjust:no='> <font style='3D"-webkit-text-size-adjust:none;fon=' t-size:15px=""
> ```
So then find_links outputs:
> ```
> 'http://fawper.xyz/corruptly/676197486/trout/gen='
> ```
When I want it to output:
> ```
> 'http://fawper.xyz/corruptly/676197486/trout/generalizing/1683814388/upgather/disjoin'
> ```
I've tried using html.parser and html5lib in place of lxml, but that didn't solve it. Could it be the encoding of the specific email that I'm parsing?
</details>
# 答案1
**得分**: 1
以下是您要翻译的内容:
将 soup 块与[lastchancexi的答案](https://stackoverflow.com/a/71416428/21936606)的一部分进行交换,该答案使用 Email 模块根据内容类型获取其有效载荷,给了我所期望的输出。
```python
def body_to_text(self):
    """Extract the text of the message at ``self.email`` and pass it to ``self.find_urls``.

    Single-part messages are handled when their content type is
    ``text/plain`` or ``text/html``: the payload is transfer-decoded
    (quoted-printable / base64) and then decoded to ``str`` with the
    declared charset.  Multipart messages contribute their ``text/plain``
    body with all newlines removed.  Prints "No body found" when a
    multipart message has no usable plain-text body.
    """
    # Parse the file once; the modern policy offers get_body()/get_content()
    # while still exposing get_payload().
    with open(self.email, "rb") as email_file:
        msg = BytesParser(policy=policy.default).parse(email_file)

    if not msg.is_multipart():
        if msg.get_content_type() in ("text/plain", "text/html"):
            # decode=True undoes the Content-Transfer-Encoding, removing the
            # "=3D" escapes and "=" soft line breaks that would otherwise cut
            # URLs in half.
            payload = msg.get_payload(decode=True) or b""
            charset = msg.get_content_charset() or "utf-8"
            try:
                # Decode to real text; str(bytes) would pass the "b'...'"
                # repr, with escaped line breaks, to the URL finder.
                text = payload.decode(charset, errors="replace")
            except LookupError:
                # The header names a charset Python does not know.
                text = payload.decode("utf-8", errors="replace")
            self.find_urls(text)
        return

    plain_part = msg.get_body(preferencelist=("plain",))
    if plain_part is None:
        print("No body found")
        return

    # Newlines are removed so the body is scanned as one line.
    body_text = plain_part.get_content().strip().replace("\n", "")
    if not body_text:
        print("No body found")
        return
    self.find_urls(body_text)
```
英文:

Swapping the soup block with a part of lastchancexi's answer, which used the Email module to get its payload based on the content type, gave me the desired output.

def body_to_text(self):
    """Extract the text of the message at ``self.email`` and pass it to ``self.find_urls``.

    Single-part messages are handled when their content type is
    ``text/plain`` or ``text/html``: the payload is transfer-decoded
    (quoted-printable / base64) and then decoded to ``str`` with the
    declared charset.  Multipart messages contribute their ``text/plain``
    body with all newlines removed.  Prints "No body found" when a
    multipart message has no usable plain-text body.
    """
    # Parse the file once; the modern policy offers get_body()/get_content()
    # while still exposing get_payload().
    with open(self.email, "rb") as email_file:
        msg = BytesParser(policy=policy.default).parse(email_file)

    if not msg.is_multipart():
        if msg.get_content_type() in ("text/plain", "text/html"):
            # decode=True undoes the Content-Transfer-Encoding, removing the
            # "=3D" escapes and "=" soft line breaks that would otherwise cut
            # URLs in half.
            payload = msg.get_payload(decode=True) or b""
            charset = msg.get_content_charset() or "utf-8"
            try:
                # Decode to real text; str(bytes) would pass the "b'...'"
                # repr, with escaped line breaks, to the URL finder.
                text = payload.decode(charset, errors="replace")
            except LookupError:
                # The header names a charset Python does not know.
                text = payload.decode("utf-8", errors="replace")
            self.find_urls(text)
        return

    # A real one-element tuple; ('plain') is just a parenthesised string.
    plain_part = msg.get_body(preferencelist=("plain",))
    if plain_part is None:
        print("No body found")
        return

    # Newlines are removed so the body is scanned as one line.
    body_text = plain_part.get_content().strip().replace("\n", "")
    if not body_text:
        print("No body found")
        return
    self.find_urls(body_text)

huangapple
  • 本文由 发表于 2023年5月22日 05:40:29
  • 转载请务必保留本文链接:https://go.coder-hub.com/76302036.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定