2023年2月6日 08:45:52go评论61阅读模式

英文:

Merge two lists in reverse chronological order using regular expressions in Python

问题

I am trying to merge two lists in Python in reverse chronological order using regular expression. I'm a little lost, the only thing I can do to merge them without errors so far is concatenate them together using the '+' method. These are the two .txt files I am trying to merge.

file 1:

poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013 10 1 13:46:42
nohw4me "i have no idea what my cs prof is saying" 2013 10 1 12:07:14
pythondiva "My memory is great <3 64GB android" 2013 10 1 10:36:11
enigma "im so clever, my code is even unreadable to me!" 2013 10 1 09:27:00

file 2:

ocd_programmer "140 character limit? so i cant write my variable names" 2013 10 1 13:18:01
caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011 10 2 02:53:47

So far my code is

My question is how do I implement the merge_tweets() method to merge the two .txt files in reverse chronological order using regular expression?

英文:

file 1:

poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013 10 1 13:46:42
nohw4me "i have no idea what my cs prof is saying" 2013 10 1 12:07:14
pythondiva "My memory is great <3 64GB android" 2013 10 1 10:36:11
enigma "im so clever, my code is even unreadable to me!" 2013 10 1 09:27:00

file 2:

ocd_programmer "140 character limit? so i cant write my variable names" 2013 10 1 13:18:01
caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011 10 2 02:53:47

So far my code is

My question is how do I implement the merge_tweets() method to merge the two .txt files in reverse chronological order using regular expression?

import re
import sys

def read_tweets(file):
    """Read a tweet file and return its parsed records.

    Each line is expected to look like
    ``tweeter "tweet text" YYYY MM DD HH:MM:SS``. A leading ``@`` before the
    tweeter name is accepted but not required. Lines that do not match the
    pattern are skipped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of dicts, in file order, with the keys ``tweeter``, ``tweet``,
        ``year``, ``month``, ``day`` (date parts as ints) and ``time`` (the
        ``HH:MM:SS`` string).
    """
    # The '@' is optional: the sample input lists bare user names, so a
    # mandatory '@' would make every line fail to match.
    pattern = re.compile(r'@?(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+:\d+:\d+)')

    records_list = []
    with open(file, 'r') as f:
        for line in f:
            match = pattern.search(line)
            if match:
                records_list.append({
                    'tweeter': match.group(1),
                    'tweet': match.group(2),
                    'year': int(match.group(3)),
                    'month': int(match.group(4)),
                    'day': int(match.group(5)),
                    'time': match.group(6)
                })
    return records_list

def merge_tweets(list1, list2):
    """Merge two record lists into one, newest tweet first.

    Args:
        list1: Records with ``year``, ``month``, ``day`` (ints) and ``time``
            (``HH:MM:SS`` string) keys, as produced by read_tweets().
        list2: A second list of records of the same shape.

    Returns:
        A new list holding every record of both inputs in reverse
        chronological order. Neither input list is modified.
    """
    def chronological_key(record):
        # Compare the clock time numerically so that an unpadded hour such
        # as '9:27:00' still sorts before '13:46:42'.
        clock = tuple(int(part) for part in record['time'].split(':'))
        return (record['year'], record['month'], record['day'], clock)

    return sorted(list1 + list2, key=chronological_key, reverse=True)

def write_tweets(records_list, file):
    """Write records to *file*, one tweet per line.

    Each line has the form ``@tweeter "tweet" year month day time``.

    Args:
        records_list: Iterable of dicts with the keys ``tweeter``, ``tweet``,
            ``year``, ``month``, ``day`` and ``time``.
        file: Path of the output file; it is created or overwritten.
    """
    with open(file, 'w') as f:
        for record in records_list:
            f.write(
                f'@{record["tweeter"]} "{record["tweet"]}" '
                f'{record["year"]} {record["month"]} {record["day"]} {record["time"]}\n')

def main():
    """Command-line entry point: merge two tweet files into a third.

    Expects exactly three command-line arguments (two input paths and one
    output path); otherwise prints a usage message and exits with status 1.
    Reports which input held more tweets, merges the records with
    merge_tweets() and writes the result with write_tweets().
    """
    if len(sys.argv) != 4:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)

    file1, file2, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

    print('Reading files...')
    records_list1 = read_tweets(file1)
    records_list2 = read_tweets(file2)

    if len(records_list1) > len(records_list2):
        print(f'{file1} contained the most tweets with {len(records_list1)}.')
    elif len(records_list2) > len(records_list1):
        print(f'{file2} contained the most tweets with {len(records_list2)}.')
    else:
        print(f'{file1} and {file2} both contained {len(records_list1)} tweets.')

    print('\nMerging files...')
    records_list = merge_tweets(records_list1, records_list2)
    print('Files merged.')

    print('\nWriting file...')
    write_tweets(records_list, output_file)
    print('File written.')

答案1

得分: 0

我添加了一些辅助函数。以下是一些注释：

read_files() 执行合并操作，所以我取消了 merge_tweets()
当处理时间戳时，datetime 是有帮助的，格式化的时间戳会被写入文件（你可以在 write_tweets() 中重新处理 record["timestamp"] 并以你自己的格式再次写入）
这些函数传递存储在内存中的列表，所以如果你有很多推文，请小心使用迭代器，它们在内存上更有效。我传递列表是因为你的函数也是这样做的。

import re
import sys
from datetime import datetime

def read_files(file1, file2):
    """Read both tweet files into one combined, unsorted list.

    Args:
        file1: Path of the first tweet file.
        file2: Path of the second tweet file.

    Returns:
        A ``(records_list, file_lengths)`` tuple. ``records_list`` holds the
        parsed records of both files (those of ``file1`` first) and
        ``file_lengths`` is ``[count1, count2]``, the number of tweets
        parsed from each file.
    """
    records_list, file_lengths = [], []
    for file in (file1, file2):
        # Counted by hand rather than with enumerate() so that an empty
        # file simply yields 0.
        count = 0
        with open(file, 'r') as f:
            for line in f:
                record = read_tweet(line)
                # read_tweet() returns None for blank or malformed lines.
                # Storing that would make sort_tweets() fail on
                # None["timestamp"] and would inflate the tweet count.
                if record is None:
                    continue
                records_list.append(record)
                count += 1
        file_lengths.append(count)
    print('文件已合并。')
    return records_list, file_lengths

def read_tweet(line: str):
    """Parse one tweet line into a record.

    The line is searched for ``name "text" YYYY MM DD HH:MM:SS``.

    Args:
        line: One line of a tweet file.

    Returns:
        A dict with the keys ``tweeter``, ``tweet`` and ``timestamp`` (a
        ``datetime``), or ``None`` when the line does not match.
    """
    found = re.search(r'(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+):(\d+):(\d+)', line)
    if not found:
        return None

    tweeter, tweet, *stamp_fields = found.groups()
    # stamp_fields is year, month, day, hour, minute, second -- the same
    # order datetime() takes its positional arguments in.
    when = datetime(*(int(field) for field in stamp_fields))
    return {'tweeter': tweeter, 'tweet': tweet, 'timestamp': when}

def sort_tweets(records_list):
    """Return the records ordered newest first.

    Args:
        records_list: Iterable of records that each have a ``timestamp`` key.

    Returns:
        A new list sorted by ``timestamp`` in descending order; the input is
        left untouched.
    """
    def by_timestamp(record):
        return record["timestamp"]

    newest_first = list(records_list)
    newest_first.sort(key=by_timestamp, reverse=True)
    return newest_first

def write_tweets(records_list, file):
    """Write the records to *file*, one tweet per line.

    Each line has the form ``@tweeter "tweet" timestamp``, where the
    timestamp is rendered with its default string formatting.

    Args:
        records_list: Iterable of dicts with the keys ``tweeter``, ``tweet``
            and ``timestamp``.
        file: Path of the output file; it is created or overwritten.
    """
    with open(file, 'w') as out:
        out.writelines(
            f'@{entry["tweeter"]} "{entry["tweet"]}" {entry["timestamp"]}\n'
            for entry in records_list
        )

def main():
    """Command-line entry point: merge, sort and write two tweet files.

    Expects exactly three command-line arguments (two input paths and one
    output path); otherwise prints a usage message and exits with status 1.
    Reads both inputs, sorts the combined records newest first, reports
    which input held more tweets, and writes the sorted records out.
    """
    if len(sys.argv) != 4:
        print('用法: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)

    _, file1, file2, output_file = sys.argv

    print('正在读取文件...')
    merged, counts = read_files(file1, file2)

    newest_first = sort_tweets(merged)
    first_count, second_count = counts

    if first_count == second_count:
        print(f'{file1} 和 {file2} 都包含了 {first_count} 条推文。')
    elif first_count > second_count:
        print(f'{file1} 包含了最多的推文，共 {first_count} 条。')
    else:
        print(f'{file2} 包含了最多的推文，共 {second_count} 条。')

    print('\n正在写入文件...')
    write_tweets(newest_first, output_file)
    print('文件已写入。')

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

输出：

@poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013-10-01 13:46:42
@ocd_programmer "140 character limit? so i cant write my variable names" 2013-10-01 13:18:01
@nohw4me "i have no idea what my cs prof is saying" 2013-10-01 12:07:14
@pythondiva "My memory is great <3 64GB android" 2013-10-01 10:36:11
@enigma "im so clever, my code is even unreadable to me!" 2013-10-01 09:27:00
@caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011-10-02 02:53:47

英文:

I added some helper functions. A few comments:

read_files() does the merging, so I eliminated merge_tweets()
datetime is helpful when handling timestamps, and formatted timestamps are written to the file (you can reformat record["timestamp"] inside write_tweets() and write it out in your own format)
these functions pass lists stored in memory, so be careful if you have many tweets, in that case use iterators, which are memory efficient. I passed lists because your functions do so.

import re
import sys
from datetime import datetime
def read_files(file1, file2):
    """Read both tweet files into one combined, unsorted list.

    Args:
        file1: Path of the first tweet file.
        file2: Path of the second tweet file.

    Returns:
        A ``(records_list, file_lengths)`` tuple. ``records_list`` holds the
        parsed records of both files (those of ``file1`` first) and
        ``file_lengths`` is ``[count1, count2]``, the number of tweets
        parsed from each file.
    """
    records_list, file_lengths = [], []
    for file in (file1, file2):
        count = 0  # I avoided enumerate() to avoid exceptions from empty files
        with open(file, 'r') as f:
            for line in f:
                record = read_tweet(line)
                # read_tweet() returns None for blank or malformed lines.
                # Storing that would make sort_tweets() fail on
                # None["timestamp"] and would inflate the tweet count.
                if record is None:
                    continue
                records_list.append(record)
                count += 1
        file_lengths.append(count)
    print('Files merged.')
    return records_list, file_lengths
def read_tweet(line: str):
    """Parse one tweet line into a record.

    The line is searched for ``name "text" YYYY MM DD HH:MM:SS``.

    Args:
        line: One line of a tweet file.

    Returns:
        A dict with the keys ``tweeter``, ``tweet`` and ``timestamp`` (a
        ``datetime``), or ``None`` when the line does not match.
    """
    match = re.search(r'(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+):(\d+):(\d+)', line)
    if match:
        return {
            'tweeter': match.group(1),
            'tweet': match.group(2),
            'timestamp': datetime(
                year=int(match.group(3)),
                month=int(match.group(4)),
                day=int(match.group(5)),
                hour=int(match.group(6)),
                minute=int(match.group(7)),
                second=int(match.group(8)),
            ),
        }
    return None
def sort_tweets(records_list):
    """Return a new list of the records sorted by ``timestamp``, newest first.

    Args:
        records_list: Iterable of records that each have a ``timestamp`` key.

    Returns:
        A new list in descending ``timestamp`` order; the input is not
        modified.
    """
    return sorted(records_list, key=lambda x: x["timestamp"], reverse=True)
def write_tweets(records_list, file):
    """Write the records to *file*, one tweet per line.

    Each line has the form ``@tweeter "tweet" timestamp``, where the
    timestamp is rendered with its default string formatting.

    Args:
        records_list: Iterable of dicts with the keys ``tweeter``, ``tweet``
            and ``timestamp``.
        file: Path of the output file; it is created or overwritten.
    """
    with open(file, 'w') as f:
        for record in records_list:
            f.write(f'@{record["tweeter"]} "{record["tweet"]}" {record["timestamp"]}\n')
def main():
    """Command-line entry point: merge, sort and write two tweet files.

    Expects exactly three command-line arguments (two input paths and one
    output path); otherwise prints a usage message and exits with status 1.
    Reads both inputs, sorts the combined records newest first, reports
    which input held more tweets, and writes the sorted records out.
    """
    if len(sys.argv) != 4:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)

    file1, file2, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

    print('Reading files...')
    # read_files() returns (records, [count1, count2]).
    records_list = read_files(file1, file2)

    records_list_values = sort_tweets(records_list[0])
    records_list1_count, records_list2_count = records_list[1]

    if records_list1_count > records_list2_count:
        print(f'{file1} contained the most tweets with {records_list1_count}.')
    elif records_list2_count > records_list1_count:
        print(f'{file2} contained the most tweets with {records_list2_count}.')
    else:
        print(f'{file1} and {file2} both contained {records_list1_count} tweets.')

    print('\nWriting file...')
    write_tweets(records_list_values, output_file)
    print('File written.')
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Output:

@poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013-10-01 13:46:42
@ocd_programmer "140 character limit? so i cant write my variable names" 2013-10-01 13:18:01
@nohw4me "i have no idea what my cs prof is saying" 2013-10-01 12:07:14
@pythondiva "My memory is great <3 64GB android" 2013-10-01 10:36:11
@enigma "im so clever, my code is even unreadable to me!" 2013-10-01 09:27:00
@caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011-10-02 02:53:47

通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库，让每个人都能够通过互相帮助和分享经验来进步。

使用正则表达式在Python中按逆序合并两个列表

问题

答案1

Using Python 3.10, but Pyright LSP throws error "Pyright: Alternative syntax for unions requires Python 3.10 or newer"

why is this showing a TypeError: View_Account.go_to_modify_account() missing 1 required positional argument: 'id'

Go validator.v2在正则表达式中出现错误”unknown tag”。

最佳方法创建一个使用GPT和Bert架构的问题生成模型是什么？

What's the correct way to type hint an empty list as a literal in python?

如何在Highcharts Gantt中更改本地化的星期名称

如何在同一个流中使用多个过滤器和映射函数？

如何使用Map/Set来将代码优化到O(n)？

.NET MAUI Android在GitHub Actions上构建失败，错误代码为1。

如何在Playwright视觉比较中屏蔽多个定位器？

在C++中，可以使用可变模板参数来检索类型的内部类型。

selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: stale element not found

Creating and opening a URL to log in to Website via Basic Auth with Robot Framework/Selenium (Python)

AG Grid 在上下文菜单中以大文本形式打开

发表评论