Python脚本将文件夹中的RTF文件合并为CSV,其中文件名作为单独的列。

huangapple go评论65阅读模式
英文:

Python script to combine RTF files in a folder to CSV with filename as a separate column

问题

I've tried a plethora of solutions which don't work. I've figured out how to get the converted data into a csv column, but all text is in 1 cell and I haven't figured out how to get the FileNames added as a column.

import pandas as pd
import os
from striprtf.striprtf import rtf_to_text

dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\'

def getFiles():
    list = []
    FileNames = []
    for path in os.listdir(dir_path):
        if os.path.isfile(os.path.join(dir_path, path)):
            with open(os.path.join(dir_path, path)) as file:
                text = file.read()
                rtfText = rtf_to_text(text, encoding='utf-8')
                for text in rtfText:
                    list.append(rtfText)
                    FileNames.append(os.path.basename(rtfText))
    return list, FileNames

list, FileNames = getFiles()

Data = pd.DataFrame({'FileNames': FileNames, 'Data': list})

NewPath = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\NEW\\'
Data.to_csv(os.path.join(NewPath, r'Data.csv'), index=False, header=False)

I've tried for a few days to scrape Stackoverflow and find a solution and now I seem to be getting duplicate file data in each row?

I think my main issues are. Possible that I need to create an empty dataframe before the function, but I haven't got it working yet.

  • separating the text so each new line is a new cell
  • Distinct file content so there aren't duplicate rows
  • adding the FileNames.

Hopefully, the outcome looks like this...

Filename Data
Filename_1 Data line 1
Filename_1 Data line 2
Filename_2 Data line 1
Filename_2 Data line 2
Filename_2 Data line 3

Thank you for any help Python脚本将文件夹中的RTF文件合并为CSV,其中文件名作为单独的列。

英文:

I've tried a plethora of solutions which don't work. I've figured out how to get the converted data into a csv column, but all text is in 1 cell and I haven't figured out how to get the FileNames added as a column.

import pandas as pd
import os
from striprtf.striprtf import rtf_to_text

dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\'


def getFiles():
    list = []
    FileNames = []
    for path in os.listdir(dir_path):
        if os.path.isfile(os.path.join(dir_path,path)):
            with open (os.path.join(dir_path,path)) as file:
                    text = file.read()
                    rtfText = rtf_to_text(text,encoding='utf-8')
                    for text in rtfText:
                        list.append(rtfText) 
                        FileNames.append(os.path.basename(rtfText))
    return list
    return FileNames

list = getFiles()
FileNames = getFiles()


Data = pd.DataFrame(columns: 'list','FileNames')



NewPath = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\NEW\\'
Data.to_csv(os.path.join(NewPath,r'Data.csv'), index = False, header = False)

I've tried for a few days to scrape Stackoverflow and find a solution and now I seem to be getting duplicate file data in each row ?

I think my main issues are. Possible that I need to create an empty dataframe before the function, but I haven't got it working yet.

  • separating the text so each new line is a new cell
  • Distinct file content so there aren't duplicate rows
  • adding the FileNames.

Hopefully, the outcome looks like this...

Filename Data
Filename_1 Data line 1
Filename_1 Data line 2
Filename_2 Data line 1
Filename_2 Data line 2
Filename_2 Data line 3

Thank you for any help Python脚本将文件夹中的RTF文件合并为CSV,其中文件名作为单独的列。

答案1

得分: 0

import pandas as pd
import os
from striprtf.striprtf import rtf_to_text

数据结构

class fileLine:
def init(self, fileName, textLine):
self.fileName = fileName
self.textLine = textLine

数据文件目录

data_dir_path = 'C:\Users\mairi\Desktop\testing txt to excel\'
output_dir_path = 'C:\Users\mairi\Desktop\testing txt to excel\NEW\'

获取给定目录中所有RTF文件的所有行

def getRTFLines():
list = []
# 对于给定目录中的每个文件
for filename in os.listdir(data_dir_path):
filePath = os.path.join(data_dir_path, filename)
# 检查它是否是RTF文件
if os.path.isfile(filePath) and filePath.endswith('.rtf'):
# 打开RTF文件
with open(filePath, encoding='utf-8', errors='ignore') as file:
# 读取RTF文件的所有内容
text = file.read()
# 解码RTF文件的内容
rtfText = rtf_to_text(text, encoding='utf-8')
# 对于RTF文件中的每一行
for line in rtfText.splitlines():
# 检查它是否为空行
if line:
# 将行插入到我们的数据结构中
list.append(fileLine(filename, line))
return list

读取数据

list = getRTFLines()

将数据转换为CSV文件

data = [[x.fileName, x.textLine] for x in list]
df = pd.DataFrame(data, columns=['文件名', '文本行'])
df.to_csv(os.path.join(output_dir_path, r'rtfData-3.csv'), escapechar="")

英文:

A friend helped me solve this pickle Python脚本将文件夹中的RTF文件合并为CSV,其中文件名作为单独的列。

I was converting and saving all text in the document instead of stripping it line by line.

import pandas as pd
import os
from striprtf.striprtf import rtf_to_text

# Data structure
class fileLine:
    def __init__(self, fileName, textLine):
        self.fileName = fileName
        self.textLine = textLine

# Data files directories
data_dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\'
output_dir_path = 'C:\\Users\\mairi\\Desktop\\testing txt to excel\\NEW\\'

# Get all lines of all RTF files in a giver directory
def getRTFLines():
    list = []
    # For each file in the given directory
    for filename in os.listdir(data_dir_path):
        filePath = os.path.join(data_dir_path, filename)
        # Check if it's an RTF file
        if os.path.isfile(filePath) and filePath.endswith('.rtf'):
            # Open the RTF file
            with open(filePath, encoding='utf-8', errors='ignore') as file:
                # Read all the RTF file's content
                text = file.read()
                # Decode the RTF file's content
                rtfText = rtf_to_text(text, encoding='utf-8')
                # For each line in the RTF file
                for line in rtfText.splitlines():
                    # Check if it's an empty line
                    if line:
                        # Insert the line in our data structure
                        list.append(fileLine(filename, line))
    return list

# Read data
list = getRTFLines()

# Convert data to csv file
data = [[x.fileName, x.textLine] for x in list]
df = pd.DataFrame(data, columns=['File Name', 'Text Line'])
df.to_csv(os.path.join(output_dir_path,r'rtfData-3.csv'), escapechar="")

huangapple
  • 本文由 发表于 2023年6月1日 00:18:15
  • 转载请务必保留本文链接:https://go.coder-hub.com/76375514.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定