
Is there a way to make this document search function faster?

Question


Is threading going to be my best bet?

I have the following and it works, but there are several folders it has to comb through and the grand total of files is just north of 500,000. Is there any way to speed this up? Multiprocessing?

```python
from pathlib import Path


def build_paths():
    """Build list of all site Uploaded Files dirs"""
    return list(Path(r"K:\BigDog").glob(r'*\Data\Files'))


def count_doc_files(doc_type, doc_list):
    """Count doc and docx files for Live sites."""
    file_dict = {}
    file_list = []
    for dir in build_paths():
        dir_parts = dir.parts
        site = dir_parts[3]
        for file in dir.rglob(doc_type):
            if file.suffix in doc_list:
                file_list.append(file.name)
        file_dict.update({site: len(file_list)})
    return file_dict


doc_list = ['.doc', '.docx']
print(count_doc_files('*.doc*', doc_list))
```

Answer 1

Score: 1


Assuming you'd like to count the files per site, I slightly modified your code to reset the file_list to an empty list on each iteration:

```python
import os
import random
import shutil
import tempfile
from pathlib import Path

from performance_measurement import run_performance_comparison


def count_doc_files(paths):
    """Count doc and docx files for Live sites."""
    doc_type = "*.doc*"
    doc_list = [".doc", ".docx"]
    file_dict = {}
    for dir in paths:
        file_list = []

        dir_parts = dir.parts
        site = dir_parts[-1]
        for file in dir.rglob(doc_type):
            if file.suffix in doc_list:
                file_list.append(file.name)
        file_dict.update({site: len(file_list)})
    return file_dict


def direct_glob(paths):
    """Replace the for loops and list construction with a direct dictionary comprehension"""
    doc_list = [".doc", ".docx"]
    doc_type = "*.doc*"

    file_dict = {
        dir.parts[-1]: sum(1 for file in dir.rglob(doc_type) if file.suffix in doc_list)
        for dir in paths
    }
    return file_dict
```

We shave off a little bit by constructing the dictionary directly with a comprehension, which avoids the overhead of building the list one item at a time. However, this doesn't make much of a difference.

```python
from collections import deque
from itertools import count

# Avoid constructing a deque each time, reduces fixed overhead enough
# that this beats the sum solution for all but length 0-1 inputs
consumeall = deque(maxlen=0).extend


def ilen(it):
    # Taken from ShadowRanger's answer at https://stackoverflow.com/a/34404546/4850343
    # Make a stateful counting iterator
    cnt = count()
    # zip it with the input iterator, then drain until input exhausted at C level
    consumeall(zip(it, cnt))  # cnt must be second zip arg to avoid advancing too far
    # Since count 0 based, the next value is the count
    return next(cnt)


def fast_counting_dictionary_comprehension(paths):
    """Count doc and docx files for Live sites."""
    doc_list = [".doc", ".docx"]
    doc_type = "*.doc*"

    file_dict = {
        dir.parts[-1]: ilen(
            file for file in dir.rglob(doc_type) if file.suffix in doc_list
        )
        for dir in paths
    }
    return file_dict
```

Here we increase the speed of counting the length of the iterator, but that also doesn't make a difference.
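As a quick illustration (not from the original answer): `ilen(x for x in range(1000) if x % 2 == 0)` returns 500 without ever materializing an intermediate list.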

```python
def multi_glob(paths):
    """glob twice, skip the if statement"""
    doc_list = [".doc", ".docx"]

    file_dict = {
        dir.parts[-1]: sum(1 for extension in doc_list for file in dir.rglob(f'*{extension}'))
        for dir in paths
    }
    return file_dict
```

The globbing itself seems to be the performance hog, because replacing the suffix filter in the dictionary comprehension with explicit repeated glob calls slows things down significantly.

Profiling shows there is not much room for improvement. I also tried globbing with glob.iglob, but that was even worse.
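Since the cost is dominated by walking the directory tree (I/O, likely over a network share), the threading idea from the question is still worth measuring: each site directory is independent, so one rglob per worker is easy to express. The sketch below is my own unbenchmarked addition (the helper name count_one_site, ThreadPoolExecutor, and max_workers=8 are assumptions, not part of the measured code above):

```python
# Unbenchmarked sketch: parallelise the per-site rglob calls with threads.
# The blocking directory reads spend their time in the filesystem, so
# threads can overlap that waiting even under the GIL.
from concurrent.futures import ThreadPoolExecutor


def count_one_site(site_dir):
    # Same counting logic as direct_glob, but for a single site directory.
    doc_list = (".doc", ".docx")
    return site_dir.parts[-1], sum(
        1 for file in site_dir.rglob("*.doc*") if file.suffix in doc_list
    )


def threaded_count(paths):
    # max_workers is a guess; tune it against the real share.
    with ThreadPoolExecutor(max_workers=8) as pool:
        return dict(pool.map(count_one_site, paths))
```

Because the work is I/O-bound rather than CPU-bound, threads are the cheaper thing to try before multiprocessing; whichever variant you pick can be dropped into the harness below to confirm it returns the same counts and to see whether it actually wins.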

```python
def setup(subdirs=3, num_dirs_with_files=10):
    """Generate a directory tree structure with doc and docx files."""
    doc_list = ["doc", "docx"]
    temp_dir = tempfile.mkdtemp()

    for i in range(subdirs):
        level_dir = os.path.join(temp_dir, f"level_{i}")
        os.mkdir(level_dir)

        if i < num_dirs_with_files:
            for doc_type in doc_list:
                for i in range(random.randint(1, 5)):
                    doc_file = os.path.join(level_dir, f"file_{i}.{doc_type}")
                    open(doc_file, "a").close()

    return [list(Path(temp_dir).glob("*"))]


def teardown(paths):
    for path in paths:
        shutil.rmtree(path)


approaches = [count_doc_files, direct_glob, fast_counting_dictionary_comprehension]

run_performance_comparison(
    approaches,
    [100, 200, 500, 1_000, 3_000, 10_000, 20_000],  # 50_0000,100_000,300_000,500_000],
    title="Performance Comparison",
    setup=setup,
    teardown=teardown,
    number_of_repetitions=1,
)
```


Profiling code:

```python
import timeit
from functools import partial

import matplotlib.pyplot as plt
from typing import List, Dict, Callable

from contextlib import contextmanager


@contextmanager
def data_provider(data_size, setup=lambda N: N, teardown=lambda: None):
    data = setup(data_size)
    yield data
    teardown(*data)


def run_performance_comparison(approaches: List[Callable],
                               data_size: List[int],
                               setup=lambda N: [N],
                               teardown=lambda: None,
                               number_of_repetitions=5, title='Performance Comparison', data_name='N'):
    # First we show that all approaches return the same result
    with data_provider(100, setup, teardown) as data:
        for approach in approaches[1:]:
            result, expected = approach(*data), approaches[0](*data)
            assert result == expected, f'{approach.__name__} returned {result} instead of {expected}'
    approach_times: Dict[Callable, List[float]] = {approach: [] for approach in approaches}
    for N in data_size:
        with data_provider(N, setup, teardown) as data:
            print(f'Running performance comparison for {data_name}={N}')
            for approach in approaches:
                function = partial(approach, *data)
                approach_time = min(timeit.Timer(function).repeat(repeat=number_of_repetitions, number=1))
                approach_times[approach].append(approach_time)

    for approach in approaches:
        plt.plot(data_size, approach_times[approach], label=approach.__name__)
    plt.yscale('log')
    plt.xscale('log')

    plt.xlabel(data_name)
    plt.ylabel('Execution Time (seconds)')
    plt.title(title)
    plt.legend()
    plt.show()
```
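If you want to benchmark an extra variant (for example the threaded sketch above), append it to the approaches list before calling run_performance_comparison: the harness first asserts that every approach returns the same result as the first one, then plots the timings on log-log axes.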
