Python Script Optimization: searching for SQL in many files


Question

I am trying to find out whether the insert statements in labels.sql are still needed. If the TEXTID of an insert statement, for example Insert into EPORTAL_DATA.MLNGTXT (MODULEID,TEXTID,LANGUAGEID,MLTEXT,TEXTTYPE) values ('COMMON','SearchButton','en-ca','Search','LABEL');, exists in any of the other files, I keep the insert statement; otherwise I delete it.

import os
import re
import shutil

current_dir_location = os.getcwd()
labels_script_location = os.path.join(current_dir_location, 'SQL-scripts/labels.sql')
copy_location = os.path.join(current_dir_location, 'SQL-scripts/lablesCopy.sql')

# Create a copy of the labels.sql file
shutil.copy2(labels_script_location, copy_location)

# Read the insert statements from the copy
with open(copy_location, 'r', encoding='UTF-8') as file:
    insert_statements_from_labels = file.readlines()

# Compile the regular expression pattern
text_id_pattern = re.compile(r"'[^']+'\s*,\s*'([^']+)'\s*,")  # regex that captures the TEXTID (the second quoted value)

def search_references(references_dir, text_id):
    regex = re.compile(text_id)
    for root, dirs, files in os.walk(references_dir):
        if 'node_modules' in dirs:
            dirs.remove('node_modules')
        if 'SQL-scripts' in dirs:
            dirs.remove('SQL-scripts')

        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                    if regex.search(content):
                        return True  # stop searching as soon as a match is found
            except (UnicodeDecodeError, PermissionError):
                # some files are not UTF-8 encoded
                # some files cannot be read, e.g. read.lock
                continue

    return False

def get_text_id(insert_statement):
    match = text_id_pattern.search(insert_statement)
    if match:
        text_id = match.group(1)  # group 1 holds the TEXTID (the second quoted value in the statement)
    else:
        text_id = None
    return text_id

def search_decide():
    lines_deleted = 0
    used_insert_statements = []
    unused_insert_statement = {}
    for index, insert_statement in enumerate(insert_statements_from_labels):
        text_id = get_text_id(insert_statement)
        if text_id:
            is_used = search_references(current_dir_location, text_id)
            if is_used:
                used_insert_statements.append(insert_statement)
            else:
                unused_insert_statement[index] = insert_statement
                lines_deleted += 1

    with open(copy_location, 'w', encoding='utf-8') as file:
        file.writelines(used_insert_statements)

    for index, statement in unused_insert_statement.items():
        print(f"未使用的插入语句如下:\n{index}: {statement}\n")
    print("脚本运行成功")
    print(f"已从labels.sql中删除{lines_deleted}行")

if __name__ == "__main__":
    search_decide()
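
As a quick illustration (not part of the original script), this is what text_id_pattern extracts from the example statement quoted above; the pattern captures the second quoted value, i.e. the TEXTID:

import re

text_id_pattern = re.compile(r"'[^']+'\s*,\s*'([^']+)'\s*,")
sample = "Insert into EPORTAL_DATA.MLNGTXT (MODULEID,TEXTID,LANGUAGEID,MLTEXT,TEXTTYPE) values ('COMMON','SearchButton','en-ca','Search','LABEL');"
match = text_id_pattern.search(sample)
print(match.group(1))  # prints: SearchButton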

Is there any possibility of optimization or performance improvement in this code? I am reading a 2000-line labels.sql file and checking each entry for references, and right now the script takes around 10 minutes to finish.

These are some of the details I found using the profiler and snakeviz:

[snakeviz profiler output screenshot]

and the table:

[profiler table screenshot]

Answer 1

Score: 2

Seems like your problem is that you're searching the same, presumably large, directory structure over and over, once for each search term. If you've got 2000 things to search for, based on your ncalls for open, you probably have 300-odd files under that tree, and you're reopening and rereading each of them 2000 times.

It would almost certainly be faster to only open and read each file once, and search it for all the search terms in sequence (or better, in a single pass) before moving to the next file. Or just search a smaller number of files in the first place if that's possible.

File I/O is expensive; try to do less of it.
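
A minimal sketch of that restructuring, assuming the same directory layout and the get_text_id helper from the question (the function name find_used_text_ids is made up for illustration). Each file is read exactly once and checked against every remaining TEXTID with a plain substring test, which also avoids compiling one regex per ID:

import os

def find_used_text_ids(references_dir, text_ids):
    # Walk the tree once and return the set of TEXTIDs that appear in any file.
    remaining = set(text_ids)   # IDs we have not found yet
    used = set()
    for root, dirs, files in os.walk(references_dir):
        # prune the same directories as the original script
        dirs[:] = [d for d in dirs if d not in ('node_modules', 'SQL-scripts')]
        for file_name in files:
            file_path = os.path.join(root, file_name)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (UnicodeDecodeError, PermissionError):
                continue
            found = {tid for tid in remaining if tid in content}
            used |= found
            remaining -= found
            if not remaining:   # every ID has been seen, stop early
                return used
    return used

search_decide would then call this once, build the set of used IDs up front, and keep only the insert statements whose TEXTID is in that set, instead of walking the whole tree once per statement.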

huangapple · published 2023-07-06 21:46:39 · original link: https://go.coder-hub.com/76629537.html