使用正则表达式在Python中按逆序合并两个列表

huangapple go评论58阅读模式
英文:

Merge two lists in reverse chronological order using regular expressions in Python

问题

I am trying to merge two lists in Python in reverse chronological order using regular expressions. I'm a little lost; the only thing I can do to merge them without errors so far is concatenate them using the '+' operator. These are the two .txt files I am trying to merge.

file 1:

poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013 10 1 13:46:42
nohw4me "i have no idea what my cs prof is saying" 2013 10 1 12:07:14
pythondiva "My memory is great <3 64GB android" 2013 10 1 10:36:11
enigma "im so clever, my code is even unreadable to me!" 2013 10 1 09:27:00

file 2:

ocd_programmer "140 character limit? so i cant write my variable names" 2013 10 1 13:18:01
caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011 10 2 02:53:47

So far my code is

My question is how do I implement the merge_tweets() method to merge the two .txt files in reverse chronological order using regular expression?

英文:

I am trying to merge two lists in Python in reverse chronological order using regular expressions. I'm a little lost; the only thing I can do to merge them without errors so far is concatenate them using the '+' operator. These are the two .txt files I am trying to merge.

file 1:

poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013 10 1 13:46:42
nohw4me "i have no idea what my cs prof is saying" 2013 10 1 12:07:14
pythondiva "My memory is great <3 64GB android" 2013 10 1 10:36:11
enigma "im so clever, my code is even unreadable to me!" 2013 10 1 09:27:00

file 2:

ocd_programmer "140 character limit? so i cant write my variable names" 2013 10 1 13:18:01
caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011 10 2 02:53:47

So far my code is

My question is how do I implement the merge_tweets() method to merge the two .txt files in reverse chronological order using regular expression?

import re
import sys

def read_tweets(file):
    """Parse a tweet file into a list of record dictionaries.

    Each line is expected to look like
    ``@name "text" YEAR MONTH DAY HH:MM:SS``.  The leading ``@`` is
    optional so that files storing bare user names are parsed as well.
    Lines that do not match the pattern are skipped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of dicts, in file order, with the keys ``tweeter`` and
        ``tweet`` (strings), ``year``, ``month`` and ``day`` (ints) and
        ``time`` (the ``HH:MM:SS`` text exactly as it appears in the file).
    """
    # Compiled once, outside the loop; the greedy ``(.*)`` lets the tweet
    # text itself contain double quotes.
    pattern = re.compile(r'@?(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+:\d+:\d+)')

    records_list = []
    with open(file, 'r') as f:
        for line in f:
            match = pattern.search(line)
            if match:
                records_list.append({
                    'tweeter': match.group(1),
                    'tweet': match.group(2),
                    'year': int(match.group(3)),
                    'month': int(match.group(4)),
                    'day': int(match.group(5)),
                    'time': match.group(6)
                })
    return records_list

def merge_tweets(list1, list2):
    """Merge two tweet record lists into one, newest tweet first.

    Args:
        list1: Records with ``year``, ``month``, ``day`` (ints) and
            ``time`` (an ``HH:MM:SS`` string) keys.
        list2: Records of the same shape.

    Returns:
        A new list holding every record from both inputs in reverse
        chronological order.  Records with identical timestamps keep
        their relative input order (``list1`` entries before ``list2``).
    """
    def chronological_key(record):
        # Compare the clock time numerically so that an unpadded value such
        # as "9:05:00" still sorts before "10:00:00".
        hour, minute, second = (int(part) for part in record['time'].split(':'))
        return (record['year'], record['month'], record['day'],
                hour, minute, second)

    return sorted(list1 + list2, key=chronological_key, reverse=True)

def write_tweets(records_list, file):
    """Write tweet records to a text file, one tweet per line.

    Each line has the form ``@tweeter "tweet" year month day time``.

    Args:
        records_list: Iterable of dicts with the keys ``tweeter``,
            ``tweet``, ``year``, ``month``, ``day`` and ``time``.
        file: Path of the output file; it is opened in write mode, so any
            existing content is replaced.
    """
    with open(file, 'w') as f:
        for record in records_list:
            f.write(
                f'@{record["tweeter"]} "{record["tweet"]}" '
                f'{record["year"]} {record["month"]} {record["day"]} {record["time"]}\n'
            )

def main():
    """Command-line entry point: read two tweet files, merge them, write the result.

    Expects exactly three command-line arguments (two input paths and one
    output path).  With any other argument count it prints a usage message
    and exits with status 1.
    """
    if len(sys.argv) != 4:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)

    file1, file2, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

    print('Reading files...')
    records_list1 = read_tweets(file1)
    records_list2 = read_tweets(file2)

    # Report which input held more tweets (or that they tie).
    if len(records_list1) > len(records_list2):
        print(f'{file1} contained the most tweets with {len(records_list1)}.')
    elif len(records_list2) > len(records_list1):
        print(f'{file2} contained the most tweets with {len(records_list2)}.')
    else:
        print(f'{file1} and {file2} both contained {len(records_list1)} tweets.')

    print('\nMerging files...')
    records_list = merge_tweets(records_list1, records_list2)
    print('Files merged.')

    print('\nWriting file...')
    write_tweets(records_list, output_file)
    print('File written.')

答案1

得分: 0

我添加了一些辅助函数。以下是一些注释:

  • read_files() 执行合并操作,所以我取消了 merge_tweets()
  • 当处理时间戳时,datetime 是有帮助的,格式化的时间戳会被写入文件(你可以在 write_tweets() 中重新处理 record["timestamp"] 并以你自己的格式再次写入)
  • 这些函数传递存储在内存中的列表,所以如果你有很多推文,请小心使用迭代器,它们在内存上更有效。我传递列表是因为你的函数也是这样做的。
import re
import sys
from datetime import datetime

def read_files(file1, file2):
    """Read both tweet files into one combined list and count tweets per file.

    Every line is handed to ``read_tweet``.  Lines it cannot parse (it
    returns ``None`` for them, e.g. blank lines) are skipped so that they
    neither end up in the combined list nor inflate the per-file counts.

    Args:
        file1: Path of the first tweet file.
        file2: Path of the second tweet file.

    Returns:
        A tuple ``(records_list, file_lengths)`` where ``records_list``
        holds the parsed records of ``file1`` followed by those of
        ``file2`` (unsorted) and ``file_lengths`` is a two-item list with
        the number of tweets parsed from each file, in the same order.
    """
    records_list, file_lengths = [], []
    for file in (file1, file2):
        # Counted by hand instead of with enumerate() so that an empty file
        # simply yields 0 without any special handling.
        count = 0
        with open(file, 'r') as f:
            for line in f:
                record = read_tweet(line)
                if record is None:
                    # A None entry would make the later sort by
                    # record["timestamp"] raise TypeError.
                    continue
                records_list.append(record)
                count += 1
        file_lengths.append(count)
    print('文件已合并。')
    return records_list, file_lengths

def read_tweet(line: str):
    """Parse one tweet line into a record dictionary.

    The line is searched for ``name "text" YEAR MONTH DAY HH:MM:SS``.

    Args:
        line: A single line of text from a tweet file.

    Returns:
        A dict with the keys ``tweeter``, ``tweet`` and ``timestamp`` (a
        ``datetime`` built from the six numeric fields), or ``None`` when
        the line does not contain the expected pattern.

    Raises:
        ValueError: If the numeric fields do not form a valid date/time.
    """
    found = re.search(r'(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+):(\d+):(\d+)', line)
    if found is None:
        return None

    # The first two groups are text; the remaining six are the timestamp
    # components in year, month, day, hour, minute, second order.
    tweeter, tweet, *stamp_fields = found.groups()
    year, month, day, hour, minute, second = map(int, stamp_fields)
    return {
        'tweeter': tweeter,
        'tweet': tweet,
        'timestamp': datetime(year, month, day, hour, minute, second),
    }

def sort_tweets(records_list):
    """Return a new list of the records ordered by ``timestamp``, largest first.

    Args:
        records_list: Iterable of dicts that each have a ``timestamp`` key
            whose values can be compared with one another.

    Returns:
        A new list; the input is left untouched.  Records with equal
        timestamps keep their original relative order.
    """
    def by_timestamp(record):
        return record["timestamp"]

    newest_first = list(records_list)
    newest_first.sort(key=by_timestamp, reverse=True)
    return newest_first

def write_tweets(records_list, file):
    """Write the records to a text file, one tweet per line.

    Each line has the form ``@tweeter "tweet" timestamp``, where the
    timestamp is rendered with its default string conversion.

    Args:
        records_list: Iterable of dicts with the keys ``tweeter``,
            ``tweet`` and ``timestamp``.
        file: Path of the output file; it is opened in write mode, so any
            existing content is replaced.
    """
    with open(file, 'w') as out:
        out.writelines(
            f'@{entry["tweeter"]} "{entry["tweet"]}" {entry["timestamp"]}\n'
            for entry in records_list
        )

def main():
    """Command-line entry point: read two tweet files, sort them, write the result.

    Expects exactly three command-line arguments (two input paths and one
    output path).  With any other argument count it prints a usage message
    and exits with status 1.
    """
    if len(sys.argv) != 4:
        print('用法: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)

    file1, file2, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

    print('正在读取文件...')
    # read_files() returns the combined records plus the per-file counts.
    merged_records, file_lengths = read_files(file1, file2)

    records_list_values = sort_tweets(merged_records)
    records_list1_count, records_list2_count = file_lengths

    # Report which input held more tweets (or that they tie).
    if records_list1_count > records_list2_count:
        print(f'{file1} 包含了最多的推文,共 {records_list1_count} 条。')
    elif records_list2_count > records_list1_count:
        print(f'{file2} 包含了最多的推文,共 {records_list2_count} 条。')
    else:
        # The two file names need a conjunction between them; without it
        # they are printed fused together as one unreadable path.
        print(f'{file1} 和 {file2} 都包含了 {records_list1_count} 条推文。')

    print('\n正在写入文件...')
    write_tweets(records_list_values, output_file)
    print('文件已写入。')

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

输出:

@poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013-10-01 13:46:42
@ocd_programmer "140 character limit? so i cant write my variable names" 2013-10-01 13:18:01
@nohw4me "i have no idea what my cs prof is saying" 2013-10-01 12:07:14
@pythondiva "My memory is great <3 64GB android" 2013-10-01 10:36:11
@enigma "im so clever, my code is even unreadable to me!" 2013-10-01 09:27:00
@caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011-10-02 02:53:47
英文:

I added some helper functions. A few comments:

  • read_files() does the merging, so I eliminated merge_tweets()
  • datetime is helpful when handling timestamps, and formatted timestamps are written to file (you can re-format record["timestamp"] inside write_tweets() and write it out in your own format)
  • these functions pass lists stored in memory, so be careful if you have many tweets, in that case use iterators, which are memory efficient. I passed lists because your functions do so.
import re
import sys
from datetime import datetime
def read_files(file1, file2):
    """Read both tweet files into one combined list and count tweets per file.

    Every line is handed to ``read_tweet``.  Lines it cannot parse (it
    returns ``None`` for them, e.g. blank lines) are skipped so that they
    neither end up in the combined list nor inflate the per-file counts.

    Args:
        file1: Path of the first tweet file.
        file2: Path of the second tweet file.

    Returns:
        A tuple ``(records_list, file_lengths)`` where ``records_list``
        holds the parsed records of ``file1`` followed by those of
        ``file2`` (unsorted) and ``file_lengths`` is a two-item list with
        the number of tweets parsed from each file, in the same order.
    """
    records_list, file_lengths = [], []
    for file in (file1, file2):
        count = 0  # I avoided enumerate() to avoid exceptions from empty files
        with open(file, 'r') as f:
            for line in f:
                record = read_tweet(line)
                if record is None:
                    # A None entry would make the later sort by
                    # record["timestamp"] raise TypeError.
                    continue
                records_list.append(record)
                count += 1
        file_lengths.append(count)
    print('Files merged.')
    return records_list, file_lengths
def read_tweet(line: str):
    """Parse one tweet line into a record dictionary.

    The line is searched for ``name "text" YEAR MONTH DAY HH:MM:SS``.

    Args:
        line: A single line of text from a tweet file.

    Returns:
        A dict with the keys ``tweeter``, ``tweet`` and ``timestamp`` (a
        ``datetime`` built from the six numeric fields), or ``None`` when
        the line does not contain the expected pattern.

    Raises:
        ValueError: If the numeric fields do not form a valid date/time.
    """
    match = re.search(r'(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+):(\d+):(\d+)', line)
    if match:
        return {
            'tweeter': match.group(1),
            'tweet': match.group(2),
            'timestamp': datetime(
                year=int(match.group(3)),
                month=int(match.group(4)),
                day=int(match.group(5)),
                hour=int(match.group(6)),
                minute=int(match.group(7)),
                second=int(match.group(8)),
            ),
        }
    return None
def sort_tweets(records_list):
    """Return a new list of the records ordered by ``timestamp``, largest first.

    Args:
        records_list: Iterable of dicts that each have a ``timestamp`` key
            whose values can be compared with one another.

    Returns:
        A new list; the input is left untouched.  Records with equal
        timestamps keep their original relative order.
    """
    return sorted(records_list, key=lambda x: x["timestamp"], reverse=True)
def write_tweets(records_list, file):
    """Write the records to a text file, one tweet per line.

    Each line has the form ``@tweeter "tweet" timestamp``, where the
    timestamp is rendered with its default string conversion.

    Args:
        records_list: Iterable of dicts with the keys ``tweeter``,
            ``tweet`` and ``timestamp``.
        file: Path of the output file; it is opened in write mode, so any
            existing content is replaced.
    """
    with open(file, 'w') as f:
        for record in records_list:
            f.write(f'@{record["tweeter"]} "{record["tweet"]}" {record["timestamp"]}\n')
def main():
    """Command-line entry point: read two tweet files, sort them, write the result.

    Expects exactly three command-line arguments (two input paths and one
    output path).  With any other argument count it prints a usage message
    and exits with status 1.
    """
    if len(sys.argv) != 4:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)

    file1, file2, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

    print('Reading files...')
    # read_files() returns (combined records, [count for file1, count for file2]).
    records_list = read_files(file1, file2)

    records_list_values = sort_tweets(records_list[0])
    records_list1_count, records_list2_count = records_list[1]

    # Report which input held more tweets (or that they tie).
    if records_list1_count > records_list2_count:
        print(f'{file1} contained the most tweets with {records_list1_count}.')
    elif records_list2_count > records_list1_count:
        print(f'{file2} contained the most tweets with {records_list2_count}.')
    else:
        print(f'{file1} and {file2} both contained {records_list1_count} tweets.')

    print('\nWriting file...')
    write_tweets(records_list_values, output_file)
    print('File written.')
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Output:

@poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013-10-01 13:46:42
@ocd_programmer "140 character limit? so i cant write my variable names" 2013-10-01 13:18:01
@nohw4me "i have no idea what my cs prof is saying" 2013-10-01 12:07:14
@pythondiva "My memory is great <3 64GB android" 2013-10-01 10:36:11
@enigma "im so clever, my code is even unreadable to me!" 2013-10-01 09:27:00
@caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011-10-02 02:53:47

huangapple
  • 本文由 发表于 2023年2月6日 08:45:52
  • 转载请务必保留本文链接:https://go.coder-hub.com/75356507.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定