英文:
Python script to combine RTF files in a folder to CSV with filename as a separate column
问题
I've tried a plethora of solutions which don't work. I've figured out how to get the converted data into a csv column, but all text is in 1 cell and I haven't figured out how to get the FileNames added as a column.
import pandas as pd
import os
from striprtf.striprtf import rtf_to_text
dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\'
def getFiles():
list = []
FileNames = []
for path in os.listdir(dir_path):
if os.path.isfile(os.path.join(dir_path, path)):
with open(os.path.join(dir_path, path)) as file:
text = file.read()
rtfText = rtf_to_text(text, encoding='utf-8')
for text in rtfText:
list.append(rtfText)
FileNames.append(os.path.basename(rtfText))
return list, FileNames
list, FileNames = getFiles()
Data = pd.DataFrame({'FileNames': FileNames, 'Data': list})
NewPath = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\NEW\\'
Data.to_csv(os.path.join(NewPath, r'Data.csv'), index=False, header=False)
I've tried for a few days to scrape Stackoverflow and find a solution and now I seem to be getting duplicate file data in each row?
I think my main issues are. Possible that I need to create an empty dataframe before the function, but I haven't got it working yet.
- separating the text so each new line is a new cell
- Distinct file content so there aren't duplicate rows
- adding the FileNames.
Hopefully, the outcome looks like this...
Filename | Data |
---|---|
Filename_1 | Data line 1 |
Filename_1 | Data line 2 |
Filename_2 | Data line 1 |
Filename_2 | Data line 2 |
Filename_2 | Data line 3 |
Thank you for any help
英文:
I've tried a plethora of solutions which don't work. I've figured out how to get the converted data into a csv column, but all text is in 1 cell and I haven't figured out how to get the FileNames added as a column.
import pandas as pd
import os
from striprtf.striprtf import rtf_to_text
dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\'
def getFiles():
list = []
FileNames = []
for path in os.listdir(dir_path):
if os.path.isfile(os.path.join(dir_path,path)):
with open (os.path.join(dir_path,path)) as file:
text = file.read()
rtfText = rtf_to_text(text,encoding='utf-8')
for text in rtfText:
list.append(rtfText)
FileNames.append(os.path.basename(rtfText))
return list
return FileNames
list = getFiles()
FileNames = getFiles()
Data = pd.DataFrame(columns: 'list','FileNames')
NewPath = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\NEW\\'
Data.to_csv(os.path.join(NewPath,r'Data.csv'), index = False, header = False)
I've tried for a few days to scrape Stackoverflow and find a solution and now I seem to be getting duplicate file data in each row ?
I think my main issues are. Possible that I need to create an empty dataframe before the function, but I haven't got it working yet.
- separating the text so each new line is a new cell
- Distinct file content so there aren't duplicate rows
- adding the FileNames.
Hopefully, the outcome looks like this...
Filename | Data |
---|---|
Filename_1 | Data line 1 |
Filename_1 | Data line 2 |
Filename_2 | Data line 1 |
Filename_2 | Data line 2 |
Filename_2 | Data line 3 |
Thank you for any help
答案1
得分: 0
import pandas as pd
import os
from striprtf.striprtf import rtf_to_text
数据结构
class fileLine:
def init(self, fileName, textLine):
self.fileName = fileName
self.textLine = textLine
数据文件目录
data_dir_path = 'C:\Users\mairi\Desktop\testing txt to excel\'
output_dir_path = 'C:\Users\mairi\Desktop\testing txt to excel\NEW\'
获取给定目录中所有RTF文件的所有行
def getRTFLines():
list = []
# 对于给定目录中的每个文件
for filename in os.listdir(data_dir_path):
filePath = os.path.join(data_dir_path, filename)
# 检查它是否是RTF文件
if os.path.isfile(filePath) and filePath.endswith('.rtf'):
# 打开RTF文件
with open(filePath, encoding='utf-8', errors='ignore') as file:
# 读取RTF文件的所有内容
text = file.read()
# 解码RTF文件的内容
rtfText = rtf_to_text(text, encoding='utf-8')
# 对于RTF文件中的每一行
for line in rtfText.splitlines():
# 检查它是否为空行
if line:
# 将行插入到我们的数据结构中
list.append(fileLine(filename, line))
return list
读取数据
list = getRTFLines()
将数据转换为CSV文件
data = [[x.fileName, x.textLine] for x in list]
df = pd.DataFrame(data, columns=['文件名', '文本行'])
df.to_csv(os.path.join(output_dir_path, r'rtfData-3.csv'), escapechar="")
英文:
A friend helped me solve this pickle
I was converting and saving all text in the document instead of stripping it line by line.
import pandas as pd
import os
from striprtf.striprtf import rtf_to_text
# Data structure
class fileLine:
def __init__(self, fileName, textLine):
self.fileName = fileName
self.textLine = textLine
# Data files directories
data_dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\'
output_dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\NEW\\'
# Get all lines of all RTF files in a giver directory
def getRTFLines():
list = []
# For each file in the given directory
for filename in os.listdir(data_dir_path):
filePath = os.path.join(data_dir_path, filename)
# Check if it's an RTF file
if os.path.isfile(filePath) and filePath.endswith('.rtf'):
# Open the RTF file
with open(filePath, encoding='utf-8', errors='ignore') as file:
# Read all the RTF file's content
text = file.read()
# Decode the RTF file's content
rtfText = rtf_to_text(text, encoding='utf-8')
# For each line in the RTF file
for line in rtfText.splitlines():
# Check if it's an empty line
if line:
# Insert the line in our data structure
list.append(fileLine(filename, line))
return list
# Read data
list = getRTFLines()
# Convert data to csv file
data = [[x.fileName, x.textLine] for x in list]
df = pd.DataFrame(data, columns=['File Name', 'Text Line'])
df.to_csv(os.path.join(output_dir_path,r'rtfData-3.csv'), escapechar="")
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论