Is there a way to make this document search function faster?

Question

Is threading going to be my best bet?

I have the following and it works, but there are several folders it has to comb through and the grand total of files is just north of 500,000. Is there any way to speed this up? Multiprocessing?

```python
from pathlib import Path


def build_paths():
    """Build list of all site Uploaded Files dirs"""
    return list(Path(r"K:\BigDog").glob(r'*\Data\Files'))


def count_doc_files(doc_type, doc_list):
    """Count doc and docx files for Live sites."""
    file_dict = {}
    file_list = []
    for dir in build_paths():
        dir_parts = dir.parts
        site = dir_parts[3]
        for file in dir.rglob(doc_type):
            if file.suffix in doc_list:
                file_list.append(file.name)
        file_dict.update({site: len(file_list)})
    return file_dict


doc_list = ['.doc', '.docx']
print(count_doc_files('*.doc*', doc_list))
```

Answer 1

Score: 1

Assuming you'd like to count the files per site, I slightly modified your code to reset `file_list` to an empty list on each iteration:

```python
import os
import random
import shutil
import tempfile
from pathlib import Path

from performance_measurement import run_performance_comparison


def count_doc_files(paths):
    """Count doc and docx files for Live sites."""
    doc_type = "*.doc*"
    doc_list = [".doc", ".docx"]
    file_dict = {}
    for dir in paths:
        file_list = []

        dir_parts = dir.parts
        site = dir_parts[-1]
        for file in dir.rglob(doc_type):
            if file.suffix in doc_list:
                file_list.append(file.name)
        file_dict.update({site: len(file_list)})
    return file_dict


def direct_glob(paths):
    """Replace the for loops and list construction with a direct dictionary comprehension"""
    doc_list = [".doc", ".docx"]
    doc_type = "*.doc*"

    file_dict = {
        dir.parts[-1]: sum(1 for file in dir.rglob(doc_type) if file.suffix in doc_list)
        for dir in paths
    }
    return file_dict
```

We shave off a little bit by constructing the dictionary directly with a comprehension, which avoids the overhead of building the list one item at a time. However, this doesn't make much of a difference.

```python
from collections import deque
from itertools import count

# Avoid constructing a deque each time, reduces fixed overhead enough
# that this beats the sum solution for all but length 0-1 inputs
consumeall = deque(maxlen=0).extend


def ilen(it):
    # Taken from ShadowRanger's answer at https://stackoverflow.com/a/34404546/4850343
    # Make a stateful counting iterator
    cnt = count()
    # zip it with the input iterator, then drain until input exhausted at C level
    consumeall(zip(it, cnt))  # cnt must be second zip arg to avoid advancing too far
    # Since count 0 based, the next value is the count
    return next(cnt)


def fast_counting_dictionary_comprehension(paths):
    """Count doc and docx files for Live sites."""
    doc_list = [".doc", ".docx"]
    doc_type = "*.doc*"

    file_dict = {
        dir.parts[-1]: ilen(
            file for file in dir.rglob(doc_type) if file.suffix in doc_list
        )
        for dir in paths
    }
    return file_dict
```
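
For illustration, `ilen` simply counts how many items an iterator yields without materialising them as a list:

```python
# ilen drains the generator at C speed and returns how many items it produced.
print(ilen(x for x in range(10) if x % 2 == 0))  # 5
```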

Here we speed up counting the length of the iterator, but that doesn't make much of a difference either.

```python
def multi_glob(paths):
    """glob twice, skip the if statement"""
    doc_list = [".doc", ".docx"]

    file_dict = {
        dir.parts[-1]: sum(1 for extension in doc_list for file in dir.rglob(f'*{extension}'))
        for dir in paths
    }
    return file_dict
```

The globbing itself seems to be the performance hog: replacing the suffix filter in the comprehension with explicit repeated glob calls (multi_glob above) significantly slows things down.

Profiling shows not much improvement is possible within these approaches. I also tried globbing with glob.iglob, but that was even worse.
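
If you need to push further, one avenue not benchmarked here (so treat it as an assumption) is to bypass pathlib and glob entirely and walk the tree with os.walk, which is built on os.scandir and avoids creating a Path object per file; a minimal sketch:

```python
import os
from pathlib import Path


def count_docs_walk(paths, extensions=(".doc", ".docx")):
    """Count matching files per site with os.walk instead of rglob (untested sketch)."""
    file_dict = {}
    for site_dir in paths:
        total = 0
        for _root, _dirs, files in os.walk(site_dir):
            # os.walk reuses os.scandir under the hood, so no per-file Path objects are built.
            total += sum(1 for name in files if os.path.splitext(name)[1] in extensions)
        file_dict[Path(site_dir).parts[-1]] = total
    return file_dict
```

For reference, the directory trees used in the measurements above are generated by the following setup and teardown functions: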

```python
def setup(subdirs=3, num_dirs_with_files=10):
    """Generate a directory tree structure with doc and docx files."""
    doc_list = ["doc", "docx"]
    temp_dir = tempfile.mkdtemp()

    for i in range(subdirs):
        level_dir = os.path.join(temp_dir, f"level_{i}")
        os.mkdir(level_dir)

        if i < num_dirs_with_files:
            for doc_type in doc_list:
                for i in range(random.randint(1, 5)):
                    doc_file = os.path.join(level_dir, f"file_{i}.{doc_type}")
                    open(doc_file, "a").close()

    return [list(Path(temp_dir).glob("*"))]


def teardown(paths):
    for path in paths:
        shutil.rmtree(path)


approaches = [count_doc_files, direct_glob, fast_counting_dictionary_comprehension]

run_performance_comparison(
    approaches,
    [100, 200, 500, 1_000, 3_000, 10_000, 20_000],  # 50_0000,100_000,300_000,500_000],
    title="Performance Comparison",
    setup=setup,
    teardown=teardown,
    number_of_repetitions=1,
)
```
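
As for the threading / multiprocessing part of the question: each site directory can be scanned independently, and on K:\BigDog (presumably a network share) the time is dominated by filesystem I/O, so fanning the sites out to a pool of workers is worth an experiment. This is not benchmarked above, and whether it pays off depends on the file server; a minimal sketch with concurrent.futures (the helper names are illustrative):

```python
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path


def count_one_site(site_dir, doc_type="*.doc*", doc_list=(".doc", ".docx")):
    """Count matching files below a single site directory."""
    site_dir = Path(site_dir)
    count = sum(1 for file in site_dir.rglob(doc_type) if file.suffix in doc_list)
    return site_dir.parts[-1], count


def count_doc_files_parallel(paths, max_workers=8):
    """Scan the per-site directories in parallel worker processes."""
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        return dict(pool.map(count_one_site, paths))


if __name__ == "__main__":
    # The guard is required on Windows, where worker processes are spawned.
    paths = list(Path(r"K:\BigDog").glob(r"*\Data\Files"))
    print(count_doc_files_parallel(paths))
```

Since most of the time is spent waiting on the filesystem, a ThreadPoolExecutor is also worth trying in place of the process pool; it avoids process start-up and pickling overhead, and the GIL is released during the underlying scandir/stat calls.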

Profiling code:

```python
import timeit
from functools import partial

import matplotlib.pyplot as plt
from typing import List, Dict, Callable

from contextlib import contextmanager


@contextmanager
def data_provider(data_size, setup=lambda N: N, teardown=lambda *args: None):
    data = setup(data_size)
    yield data
    teardown(*data)


def run_performance_comparison(approaches: List[Callable],
                               data_size: List[int],
                               setup=lambda N: [N],
                               teardown=lambda *args: None,  # default accepts and ignores any arguments
                               number_of_repetitions=5, title='Performance Comparison', data_name='N'):
    # First we show that all approaches return the same result
    with data_provider(100, setup, teardown) as data:
        for approach in approaches[1:]:
            result, expected = approach(*data), approaches[0](*data)
            assert result == expected, f'{approach.__name__} returned {result} instead of {expected}'
    approach_times: Dict[Callable, List[float]] = {approach: [] for approach in approaches}
    for N in data_size:
        with data_provider(N, setup, teardown) as data:
            print(f'Running performance comparison for {data_name}={N}')
            for approach in approaches:
                function = partial(approach, *data)
                approach_time = min(timeit.Timer(function).repeat(repeat=number_of_repetitions, number=1))
                approach_times[approach].append(approach_time)

    for approach in approaches:
        plt.plot(data_size, approach_times[approach], label=approach.__name__)
    plt.yscale('log')
    plt.xscale('log')

    plt.xlabel(data_name)
    plt.ylabel('Execution Time (seconds)')
    plt.title(title)
    plt.legend()
    plt.show()
```
