英文:
Merge two lists in reverse chronological order using regular expressions in Python
问题
I am trying to merge two lists in Python in reverse chronological order using regular expressions. I'm a little lost: so far, the only way I can merge them without errors is to concatenate them with the '+' operator. These are the two .txt files I am trying to merge.
file 1:
poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013 10 1 13:46:42
nohw4me "i have no idea what my cs prof is saying" 2013 10 1 12:07:14
pythondiva "My memory is great <3 64GB android" 2013 10 1 10:36:11
enigma "im so clever, my code is even unreadable to me!" 2013 10 1 09:27:00
file 2:
ocd_programmer "140 character limit? so i cant write my variable names" 2013 10 1 13:18:01
caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011 10 2 02:53:47
So far my code is
My question is how do I implement the merge_tweets() method to merge the two .txt files in reverse chronological order using regular expression?
英文:
I am trying to merge two lists in Python in reverse chronological order using regular expressions. I'm a little lost: so far, the only way I can merge them without errors is to concatenate them with the '+' operator. These are the two .txt files I am trying to merge.
file 1:
poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013 10 1 13:46:42
nohw4me "i have no idea what my cs prof is saying" 2013 10 1 12:07:14
pythondiva "My memory is great <3 64GB android" 2013 10 1 10:36:11
enigma "im so clever, my code is even unreadable to me!" 2013 10 1 09:27:00
file 2:
ocd_programmer "140 character limit? so i cant write my variable names" 2013 10 1 13:18:01
caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011 10 2 02:53:47
So far my code is
My question is how do I implement the merge_tweets() method to merge the two .txt files in reverse chronological order using regular expression?
import re
import sys
def read_tweets(file):
    """Parse a tweet log file into a list of record dicts.

    Each line is expected to look like::

        username "tweet text" YYYY MM DD HH:MM:SS

    Lines that do not match this shape are skipped.

    Args:
        file: Path of the text file to read.

    Returns:
        A list of dicts, in file order, with the keys 'tweeter' and
        'tweet' (str), 'year', 'month' and 'day' (int), and 'time'
        (the 'HH:MM:SS' text as it appears in the file).
    """
    # The leading '@' is optional: the input files list bare usernames,
    # while write_tweets() emits '@name'. Requiring the '@' made every
    # input line fail to match, so the function always returned [].
    pattern = re.compile(r'@?(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+:\d+:\d+)')
    records_list = []
    with open(file, 'r') as f:
        for line in f:
            match = pattern.search(line)
            if match is None:
                continue
            tweeter, tweet, year, month, day, time = match.groups()
            records_list.append({
                'tweeter': tweeter,
                'tweet': tweet,
                'year': int(year),
                'month': int(month),
                'day': int(day),
                'time': time,
            })
    return records_list
def merge_tweets(list1, list2):
    """Merge two tweet record lists into one list, newest tweet first.

    Args:
        list1: List of record dicts with 'year', 'month', 'day' (int)
            and 'time' ('HH:MM:SS' str) keys.
        list2: A second list of records with the same keys.

    Returns:
        A new list containing every record from both inputs, sorted in
        reverse chronological order. Neither input list is modified.
    """
    def chronological_key(record):
        # Compare the clock time numerically so that un-padded values
        # such as '9:05:00' still order correctly against '10:00:00'.
        hour, minute, second = (int(part) for part in record['time'].split(':'))
        return (record['year'], record['month'], record['day'],
                hour, minute, second)

    # Plain concatenation left the result unordered; sort the combined
    # records so the most recent tweet comes first.
    return sorted(list1 + list2, key=chronological_key, reverse=True)
def write_tweets(records_list, file):
    """Write tweet records to a text file, one formatted tweet per line.

    Args:
        records_list: Iterable of dicts with the keys 'tweeter', 'tweet',
            'year', 'month', 'day' and 'time'.
        file: Path of the output file; it is opened in 'w' mode, so any
            existing content is replaced.
    """
    with open(file, 'w') as out:
        # Stream the formatted lines straight into the file handle.
        out.writelines(
            f'@{record["tweeter"]} "{record["tweet"]}" {record["year"]} {record["month"]} {record["day"]} {record["time"]}\n'
            for record in records_list
        )
def main():
    """Command-line entry point: read two tweet files, merge them, write the result.

    Expects exactly three command-line arguments (two input paths and an
    output path). With any other argument count it prints a usage line
    and exits with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)
    file1, file2, output_file = cli_args

    print('Reading files...')
    records_list1 = read_tweets(file1)
    records_list2 = read_tweets(file2)

    # Report which input held more tweets (or that they tied).
    if len(records_list1) == len(records_list2):
        print(f'{file1} and {file2} both contained {len(records_list1)} tweets.')
    elif len(records_list1) > len(records_list2):
        print(f'{file1} contained the most tweets with {len(records_list1)}.')
    else:
        print(f'{file2} contained the most tweets with {len(records_list2)}.')

    print('\nMerging files...')
    combined = merge_tweets(records_list1, records_list2)
    print('Files merged.')

    print('\nWriting file...')
    write_tweets(combined, output_file)
    print('File written.')
答案1
得分: 0
我添加了一些辅助函数。以下是一些注释:
- read_files() 执行合并操作,所以我取消了 merge_tweets()。
- 处理时间戳时,datetime 很有帮助;写入文件的是格式化后的时间戳(你可以在 write_tweets() 中重新处理 record["timestamp"],并以你自己的格式写入)。
- 这些函数传递的是存储在内存中的列表,所以如果推文很多请小心;这种情况下应使用迭代器,它们更节省内存。我传递列表是因为你的函数也是这样做的。
import re
import sys
from datetime import datetime
def read_files(file1, file2):
    """Read two tweet files into one combined list and count the tweets in each.

    Every line is parsed with read_tweet(). Lines it cannot parse (for
    example blank lines) are skipped and are not counted. A status
    message is printed once both files have been read.

    Args:
        file1: Path of the first tweet file.
        file2: Path of the second tweet file.

    Returns:
        A tuple ``(records_list, file_lengths)``: the parsed records of
        both files in read order, and a two-item list holding the number
        of parsed tweets from file1 and file2 respectively.
    """
    records_list, file_lengths = [], []
    for file in (file1, file2):
        # Counted by hand rather than with enumerate() so that an empty
        # file simply yields 0 instead of raising.
        count = 0
        with open(file, 'r') as f:
            for line in f:
                record = read_tweet(line)
                if record is None:
                    # read_tweet() returns None for unparseable lines;
                    # storing it would crash sort_tweets() when it reads
                    # x["timestamp"], and would inflate the tweet count.
                    continue
                records_list.append(record)
                count += 1
        file_lengths.append(count)
    print('文件已合并。')
    return records_list, file_lengths
def read_tweet(line: str):
    """Parse one tweet line into a record dict.

    The line is searched for ``name "text" YYYY MM DD HH:MM:SS``.

    Args:
        line: A single line of text from a tweet file.

    Returns:
        A dict with 'tweeter' (str), 'tweet' (str) and 'timestamp'
        (datetime) when the pattern is found, otherwise None.
    """
    found = re.search(r'(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+):(\d+):(\d+)', line)
    if found is None:
        return None
    # The first two groups are text; the remaining six are date/time numbers.
    tweeter, tweet, *stamp_fields = found.groups()
    year, month, day, hour, minute, second = map(int, stamp_fields)
    return {
        'tweeter': tweeter,
        'tweet': tweet,
        'timestamp': datetime(year, month, day, hour, minute, second),
    }
def sort_tweets(records_list):
    """Return a new list of the records ordered newest-first.

    Args:
        records_list: Iterable of dicts that each carry a comparable
            'timestamp' value.

    Returns:
        A new list sorted by 'timestamp' in descending order; the input
        is left untouched.
    """
    def by_timestamp(record):
        return record["timestamp"]

    ordered = list(records_list)
    ordered.sort(key=by_timestamp, reverse=True)
    return ordered
def write_tweets(records_list, file):
    """Write tweet records to a text file, one formatted tweet per line.

    Args:
        records_list: Iterable of dicts with the keys 'tweeter', 'tweet'
            and 'timestamp'; the timestamp is rendered with its default
            string formatting.
        file: Path of the output file; it is opened in 'w' mode, so any
            existing content is replaced.
    """
    with open(file, 'w') as out:
        # Stream the formatted lines straight into the file handle.
        out.writelines(
            f'@{record["tweeter"]} "{record["tweet"]}" {record["timestamp"]}\n'
            for record in records_list
        )
def main():
    """Command-line entry point: read two tweet files, sort them, write the result.

    Expects exactly three command-line arguments (two input paths and an
    output path). With any other argument count it prints a usage line
    and exits with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print('用法: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)
    file1, file2, output_file = cli_args

    print('正在读取文件...')
    # read_files() returns the combined records and the per-file counts.
    combined, per_file_counts = read_files(file1, file2)
    newest_first = sort_tweets(combined)
    records_list1_count, records_list2_count = per_file_counts

    # Report which input held more tweets (or that they tied).
    if records_list1_count == records_list2_count:
        print(f'{file1} 和 {file2} 都包含了 {records_list1_count} 条推文。')
    elif records_list1_count > records_list2_count:
        print(f'{file1} 包含了最多的推文,共 {records_list1_count} 条。')
    else:
        print(f'{file2} 包含了最多的推文,共 {records_list2_count} 条。')

    print('\n正在写入文件...')
    write_tweets(newest_first, output_file)
    print('文件已写入。')


# Run the command-line program only when executed as a script.
if __name__ == "__main__":
    main()
输出:
@poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013-10-01 13:46:42
@ocd_programmer "140 character limit? so i cant write my variable names" 2013-10-01 13:18:01
@nohw4me "i have no idea what my cs prof is saying" 2013-10-01 12:07:14
@pythondiva "My memory is great <3 64GB android" 2013-10-01 10:36:11
@enigma "im so clever, my code is even unreadable to me!" 2013-10-01 09:27:00
@caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011-10-02 02:53:47
英文:
I added some helper functions. A few comments:
- `read_files()` does the merging, so I eliminated `merge_tweets()`.
- `datetime` is helpful when handling timestamps, and formatted timestamps are written to the file (you can re-manipulate `record["timestamp"]` inside `write_tweets()` and write it out again in your own format).
- These functions pass lists stored in memory, so be careful if you have many tweets; in that case use iterators, which are memory efficient. I passed lists because your functions do so.
import re
import sys
from datetime import datetime
def read_files(file1, file2):
    """Read two tweet files into one combined list and count the tweets in each.

    Every line is parsed with read_tweet(). Lines it cannot parse (for
    example blank lines) are skipped and are not counted. A status
    message is printed once both files have been read.

    Args:
        file1: Path of the first tweet file.
        file2: Path of the second tweet file.

    Returns:
        A tuple ``(records_list, file_lengths)``: the parsed records of
        both files in read order, and a two-item list holding the number
        of parsed tweets from file1 and file2 respectively.
    """
    records_list, file_lengths = [], []
    for file in (file1, file2):
        # Counted by hand rather than with enumerate() so that an empty
        # file simply yields 0 instead of raising.
        count = 0
        with open(file, 'r') as f:
            for line in f:
                record = read_tweet(line)
                if record is None:
                    # read_tweet() returns None for unparseable lines;
                    # storing it would crash sort_tweets() when it reads
                    # x["timestamp"], and would inflate the tweet count.
                    continue
                records_list.append(record)
                count += 1
        file_lengths.append(count)
    print('Files merged.')
    return records_list, file_lengths
def read_tweet(line: str):
    """Parse one tweet line into a record dict.

    The line is searched for ``name "text" YYYY MM DD HH:MM:SS``.

    Args:
        line: A single line of text from a tweet file.

    Returns:
        A dict with 'tweeter' (str), 'tweet' (str) and 'timestamp'
        (datetime) when the pattern is found, otherwise None.
    """
    found = re.search(r'(\w+) "(.*)" (\d+) (\d+) (\d+) (\d+):(\d+):(\d+)', line)
    if found is None:
        return None
    # The first two groups are text; the remaining six are date/time numbers.
    tweeter, tweet, *stamp_fields = found.groups()
    year, month, day, hour, minute, second = map(int, stamp_fields)
    return {
        'tweeter': tweeter,
        'tweet': tweet,
        'timestamp': datetime(year, month, day, hour, minute, second),
    }
def sort_tweets(records_list):
    """Return a new list of the records ordered newest-first.

    Args:
        records_list: Iterable of dicts that each carry a comparable
            'timestamp' value.

    Returns:
        A new list sorted by 'timestamp' in descending order; the input
        is left untouched.
    """
    def by_timestamp(record):
        return record["timestamp"]

    ordered = list(records_list)
    ordered.sort(key=by_timestamp, reverse=True)
    return ordered
def write_tweets(records_list, file):
    """Write tweet records to a text file, one formatted tweet per line.

    Args:
        records_list: Iterable of dicts with the keys 'tweeter', 'tweet'
            and 'timestamp'; the timestamp is rendered with its default
            string formatting.
        file: Path of the output file; it is opened in 'w' mode, so any
            existing content is replaced.
    """
    with open(file, 'w') as out:
        # Stream the formatted lines straight into the file handle.
        out.writelines(
            f'@{record["tweeter"]} "{record["tweet"]}" {record["timestamp"]}\n'
            for record in records_list
        )
def main():
    """Command-line entry point: read two tweet files, sort them, write the result.

    Expects exactly three command-line arguments (two input paths and an
    output path). With any other argument count it prints a usage line
    and exits with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print('Usage: python twitter_sort.py <file1> <file2> <output_file>')
        sys.exit(1)
    file1, file2, output_file = cli_args

    print('Reading files...')
    # read_files() returns the combined records and the per-file counts.
    combined, per_file_counts = read_files(file1, file2)
    newest_first = sort_tweets(combined)
    records_list1_count, records_list2_count = per_file_counts

    # Report which input held more tweets (or that they tied).
    if records_list1_count == records_list2_count:
        print(f'{file1} and {file2} both contained {records_list1_count} tweets.')
    elif records_list1_count > records_list2_count:
        print(f'{file1} contained the most tweets with {records_list1_count}.')
    else:
        print(f'{file2} contained the most tweets with {records_list2_count}.')

    print('\nWriting file...')
    write_tweets(newest_first, output_file)
    print('File written.')


# Run the command-line program only when executed as a script.
if __name__ == "__main__":
    main()
Output:
@poptardsarefamous "Sometimes I wonder 2 == b or !(2 == b)" 2013-10-01 13:46:42
@ocd_programmer "140 character limit? so i cant write my variable names" 2013-10-01 13:18:01
@nohw4me "i have no idea what my cs prof is saying" 2013-10-01 12:07:14
@pythondiva "My memory is great <3 64GB android" 2013-10-01 10:36:11
@enigma "im so clever, my code is even unreadable to me!" 2013-10-01 09:27:00
@caffeine4life "BBBBZZZZzzzzzZZZZZZZzzzZZzzZzzZzTTTTttt" 2011-10-02 02:53:47
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论