英文:
Highlight python-docx with regex and spacy
问题
I want to highlight regex pattern matches in the .docx files in a folder using python-docx. I am able to achieve this with the plain regex code below.
The issue arises when I try to achieve the same thing with spaCy's nlp pipeline.
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
import pandas as pd
import os
import re
import spacy
# Question code as posted (indentation was lost on this page). It is kept
# unchanged because it is the failing example the question is about.
nlp = spacy.load("en_core_web_sm")
path = r"/home/coder/Documents/"
doc1 = Document('test.docx')
# nlp() receives the python-docx Document object itself rather than text;
# this is the call that raises the ValueError [E1041] quoted below.
doc = nlp(doc1)
# re_highlight = re.compile(r"[1-9][0-9]*|0") # This one works.
# Here re_highlight becomes a plain list of tokens, yet .findall() and
# .finditer() are called on it further down as if it were a compiled regex.
# NOTE(review): like_num is compared with the string "TRUE" — presumably a
# boolean test was intended; confirm.
re_highlight = [token for token in doc if token.like_num == "TRUE"]
for filename in os.listdir(path):
if filename.endswith(".docx"):
# NOTE(review): files are listed from /home/coder/Documents/ (path above)
# but this string points at /home/writer/Documents/.
file = "/home/writer/Documents/" + filename
print(file)
# doc is the result of nlp(), not the python-docx document (doc1), so
# .paragraphs here and .save() at the end are called on the wrong object.
for para in doc.paragraphs:
text = para.text
if len(re_highlight.findall(text)) > 0:
matches = re_highlight.finditer(text)
para.text = ''
# p3 tracks the end offset of the previous match.
p3 = 0
for match in matches:
p1 = p3
p2, p3 = match.span()
# Text between the previous match and this one is added without highlight.
para.add_run(text[p1:p2])
run = para.add_run(text[p2:p3])
run.font.highlight_color = WD_COLOR_INDEX.YELLOW
# Remaining text after the last match.
para.add_run(text[p3:])
doc.save(file)
Error:
raise ValueError(Errors.E1041.format(type=type(doc_like)))
ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'docx.document.Document'>
I realize that `doc` has no `doc.paragraphs` because it is a spaCy nlp object rather than a python-docx document. How can I solve this problem?
Kindly help.
英文:
I want to highlight regex pattern matches in the .docx files in a folder using python-docx. I am able to achieve this with the plain regex code below.
The issue arises when I try to achieve the same thing with spaCy's nlp pipeline.
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
import pandas as pd
import os
import re
import spacy
# Question code as posted (indentation was lost on this page). It is kept
# unchanged because it is the failing example the question is about.
nlp = spacy.load("en_core_web_sm")
path = r"/home/coder/Documents/"
doc1 = Document('test.docx')
# nlp() receives the python-docx Document object itself rather than text;
# this is the call that raises the ValueError [E1041] quoted below.
doc = nlp(doc1)
#re_highlight = re.compile(r"[1-9][0-9]*|0") # This one works.
# Here re_highlight becomes a plain list of tokens, yet .findall() and
# .finditer() are called on it further down as if it were a compiled regex.
# The comprehension variable is `token`, but the condition reads `tok`,
# which is not defined anywhere in this snippet.
# NOTE(review): like_num is compared with the string "TRUE" — presumably a
# boolean test was intended; confirm.
re_highlight = [token for token in doc if tok.like_num == "TRUE"]
for filename in os.listdir(path):
if filename.endswith(".docx"):
# NOTE(review): files are listed from /home/coder/Documents/ (path above)
# but this string points at /home/writer/Documents/.
file = "/home/writer/Documents/" + filename
print(file)
# doc is the result of nlp(), not the python-docx document (doc1), so
# .paragraphs here and .save() at the end are called on the wrong object.
for para in doc.paragraphs:
text = para.text
if len(re_highlight.findall(text)) > 0:
matches = re_highlight.finditer(text)
para.text = ''
# p3 tracks the end offset of the previous match.
p3 = 0
for match in matches:
p1 = p3
p2, p3 = match.span()
# Text between the previous match and this one is added without highlight.
para.add_run(text[p1:p2])
run = para.add_run(text[p2:p3])
run.font.highlight_color = WD_COLOR_INDEX.YELLOW
# Remaining text after the last match.
para.add_run(text[p3:])
doc.save(file)
Error:
>raise ValueError(Errors.E1041.format(type=type(doc_like)))
ValueError: [E1041] Expected a string, Doc, or bytes as input, but got: <class 'docx.document.Document'>
I realize that `doc` has no `doc.paragraphs` because it is a spaCy nlp object rather than a python-docx document. How can I solve this problem?
Kindly help.
答案1
得分: 1
你不能对 `doc1` 使用 `nlp(doc1)`,因为 `doc1` 是一个 `Document` 对象,你需要提取文本部分并对其进行操作。我建议尝试类似以下的方法(在示例文件中有效):
import re
from pathlib import Path
import spacy
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
nlp = spacy.load("en_core_web_sm")
def highlight(text):
    """Return a compiled regex matching the number-like tokens of *text*.

    The text is run through the module-level spaCy pipeline ``nlp`` and every
    token whose ``like_num`` flag is set becomes one literal alternative of
    the pattern.

    Args:
        text: The paragraph text to analyse.

    Returns:
        A compiled ``re.Pattern``. When no token qualifies, the pattern is one
        that can never match, so callers can always use ``.finditer()``.
    """
    tokens = {token.text for token in nlp(text) if token.like_num}
    if not tokens:
        # Joining nothing would compile "" which matches at every position.
        return re.compile(r"(?!)")
    # Longest alternatives first so that e.g. "100" wins over "10".
    ordered = sorted(tokens, key=lambda tok: (-len(tok), tok))
    # Escape each token: texts such as "1.5" contain regex metacharacters.
    return re.compile("|".join(re.escape(tok) for tok in ordered))
path_in = Path("/home/coder/Documents/")  # Input folder
path_out = Path("/home/writer/Documents/")  # Output folder

# Highlight every match of highlight(text) in each .docx file of path_in and
# write the result under the same file name into path_out.
for file in path_in.glob("*.docx"):
    print(f"处理文件 '{file}' ... ", end="")
    doc = Document(file)
    for para in doc.paragraphs:
        text = para.text
        # Ignore zero-length matches: they contain nothing to highlight.
        matches = [m for m in highlight(text).finditer(text) if m.group()]
        if not matches:
            # Leave the paragraph untouched; assigning para.text would
            # replace its existing runs and drop their formatting.
            continue
        para.text = ""
        prev_end = 0
        for match in matches:
            start, end = match.span()
            # Unhighlighted text between the previous match and this one.
            para.add_run(text[prev_end:start])
            run = para.add_run(text[start:end])
            run.font.highlight_color = WD_COLOR_INDEX.YELLOW
            prev_end = end
        # Remaining text after the last match.
        para.add_run(text[prev_end:])
    doc.save(path_out / file.name)
    print("完成。")
存在意外高亮的可能性:例如数字标记 1 也会在 A1 这样更长的字符串内部被匹配。如果发生这种情况,你可以尝试使用以下方法:
def highlight(text):
    """Return a word-bounded regex matching the number-like tokens of *text*.

    The text is run through the module-level spaCy pipeline ``nlp`` and every
    token whose ``like_num`` flag is set becomes one literal alternative. The
    alternation is wrapped in ``\\b ... \\b`` so a token is only matched as a
    whole word, not inside a longer one.

    Args:
        text: The paragraph text to analyse.

    Returns:
        A compiled ``re.Pattern``. When no token qualifies, the pattern is one
        that can never match, so callers can always use ``.finditer()``.
    """
    tokens = {token.text for token in nlp(text) if token.like_num}
    if not tokens:
        # An empty group would yield zero-length matches at word boundaries.
        return re.compile(r"(?!)")
    # Longest alternatives first so that e.g. "100" wins over "10".
    ordered = sorted(tokens, key=lambda tok: (-len(tok), tok))
    # Escape each token: texts such as "1.5" contain regex metacharacters.
    pat = r"\b(?:" + "|".join(re.escape(tok) for tok in ordered) + r")\b"
    return re.compile(pat)
希望对你有所帮助。
英文:
You can't call `nlp(doc1)` with `doc1` being a `Document` object; you have to extract the text parts and work with them. I'd suggest something like the following instead (worked here for a sample file):
import re
from pathlib import Path
import spacy
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
nlp = spacy.load("en_core_web_sm")
def highlight(text):
    """Return a compiled regex matching the number-like tokens of *text*.

    The text is run through the module-level spaCy pipeline ``nlp`` and every
    token whose ``like_num`` flag is set becomes one literal alternative of
    the pattern.

    Args:
        text: The paragraph text to analyse.

    Returns:
        A compiled ``re.Pattern``. When no token qualifies, the pattern is one
        that can never match, so callers can always use ``.finditer()``.
    """
    tokens = {token.text for token in nlp(text) if token.like_num}
    if not tokens:
        # Joining nothing would compile "" which matches at every position.
        return re.compile(r"(?!)")
    # Longest alternatives first so that e.g. "100" wins over "10".
    ordered = sorted(tokens, key=lambda tok: (-len(tok), tok))
    # Escape each token: texts such as "1.5" contain regex metacharacters.
    return re.compile("|".join(re.escape(tok) for tok in ordered))
path_in = Path("/home/coder/Documents/")  # Input folder
path_out = Path("/home/writer/Documents/")  # Output folder

# Highlight every match of highlight(text) in each .docx file of path_in and
# write the result under the same file name into path_out.
for file in path_in.glob("*.docx"):
    print(f"Processing file '{file}' ... ", end="")
    doc = Document(file)
    for para in doc.paragraphs:
        text = para.text
        # Ignore zero-length matches: they contain nothing to highlight.
        matches = [m for m in highlight(text).finditer(text) if m.group()]
        if not matches:
            # Leave the paragraph untouched; assigning para.text would
            # replace its existing runs and drop their formatting.
            continue
        para.text = ""
        prev_end = 0
        for match in matches:
            start, end = match.span()
            # Unhighlighted text between the previous match and this one.
            para.add_run(text[prev_end:start])
            run = para.add_run(text[start:end])
            run.font.highlight_color = WD_COLOR_INDEX.YELLOW
            prev_end = end
        # Remaining text after the last match.
        para.add_run(text[prev_end:])
    doc.save(path_out / file.name)
    print("done.")
There's a chance of accidental highlighting: a number token such as 1 would also be matched inside a longer string like A1. If that happens, you could try to use
def highlight(text):
    """Return a word-bounded regex matching the number-like tokens of *text*.

    The text is run through the module-level spaCy pipeline ``nlp`` and every
    token whose ``like_num`` flag is set becomes one literal alternative. The
    alternation is wrapped in ``\\b ... \\b`` so a token is only matched as a
    whole word, not inside a longer one.

    Args:
        text: The paragraph text to analyse.

    Returns:
        A compiled ``re.Pattern``. When no token qualifies, the pattern is one
        that can never match, so callers can always use ``.finditer()``.
    """
    tokens = {token.text for token in nlp(text) if token.like_num}
    if not tokens:
        # An empty group would yield zero-length matches at word boundaries.
        return re.compile(r"(?!)")
    # Longest alternatives first so that e.g. "100" wins over "10".
    ordered = sorted(tokens, key=lambda tok: (-len(tok), tok))
    # Escape each token: texts such as "1.5" contain regex metacharacters.
    pat = r"\b(?:" + "|".join(re.escape(tok) for tok in ordered) + r")\b"
    return re.compile(pat)
instead.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论