英文:
Function to identify duplicate Python column names and add specific suffixes
问题
def change_colnames(df, suffixes):
new_columns = []
seen_columns = {}
for column in df.columns:
match = re.match(r'^(.*?)(?:\.\d+)?$', column) # 匹配基本列名和可选后缀
base_column = match.group(1) if match else column # 获取基本列名或保持原始列名
if base_column in seen_columns:
idx = seen_columns[base_column] # 获取基本列的索引
new_column = f"{base_column} - {suffixes[idx]}" # 添加新后缀
seen_columns[base_column] += 1 # 为下一次出现增加索引
else:
new_column = base_column
seen_columns[base_column] = 0 # 使用索引0添加基本列
new_columns.append(new_column)
df.columns = new_columns
return df
英文:
I have several dataframes with certain duplicate column names (they come from Excel files). My data looks a little something like this.
original_df= pd.DataFrame({
'ID': [True, False, True],
'Revenue (USDm)': [1000, 2000, 1500],
'Location': ['London', 'New York', 'Paris'],
'Year': [2021, 2022, 2023],
'Sold Products': [10, 20, 30],
'Leased Products': [5, 10, 15],
'Investments': [7, 12, 8],
'Sold Products.1': [15, 25, 35],
'Leased Products.1': [8, 12, 16],
'Investments.1': [6, 9, 11],
'Sold Products.2': [5, 10, 15],
'Leased Products.2': [2, 5, 8],
'Investments.2': [3, 7, 4],
'QC Completed?': [True, True, False],
})
When I read the df, pandas automatically adds the .1
and .2
suffixes to the duplicate column names. I tried to write a function that identifies the duplicates and adds a new set of suffixes from a list I provide, while removing the .1
and .2
where applicable.
The new suffixes list is suffixes = ['Vehicles','Electronics','Real Estate']
The output should look like this:
desired_output = pd.DataFrame({
'ID': [True, False, True],
'Revenue (USDm)': [1000, 2000, 1500],
'Location': ['London', 'New York', 'Paris'],
'Year': [2021, 2022, 2023],
'Sold Products - Vehicles': [10, 20, 30],
'Leased Products - Vehicles': [5, 10, 15],
'Investments - Vehicles': [7, 12, 8],
'Sold Products - Electronics': [15, 25, 35],
'Leased Products - Electronics': [8, 12, 16],
'Investments - Electronics': [6, 9, 11],
'Sold Products - Real Estate': [5, 10, 15],
'Leased Products - Real Estate': [2, 5, 8],
'Investments - Real Estate': [3, 7, 4],
'QC Completed?': [True, True, False],
})
The column names without any duplicates should remain the same but the columns which are duplicated get added the suffixes in order; If they also have the .1
and .2
suffixes, those get removed.
My function is below:
def change_colnames(df, suffixes):
new_columns = []
seen_columns = {}
for column in df.columns:
match = re.match(r'^(.*?)(?:\.\d+)?$', column) # Match the base column name and optional suffix
base_column = match.group(1) if match else column # Get the base column name or keep the original column name
if base_column in seen_columns:
idx = seen_columns[base_column] # Get the index of the base column
new_column = f"{base_column} {suffixes[idx]}" # Append the new suffix
seen_columns[base_column] += 1 # Increment the index for the next occurrence
else:
new_column = base_column
seen_columns[base_column] = 0 # Add the base column with index 0
new_columns.append(new_column)
df.columns = new_columns
return df
Unfortunately the first set of duplicate columns (those without the .1
and .2
suffixes) stays the same. The output I get is this:
wrong_output = pd.DataFrame({
'ID': [True, False, True],
'Revenue (USDm)': [1000, 2000, 1500],
'Location': ['London', 'New York', 'Paris'],
'Year': [2021, 2022, 2023],
'Sold Products': [10, 20, 30],
'Leased Products': [5, 10, 15],
'Investments': [7, 12, 8],
'Sold Products - Vehicles': [15, 25, 35],
'Leased Products - Vehicles': [8, 12, 16],
'Investments - Vehicles': [6, 9, 11],
'Sold Products - Electronics': [5, 10, 15],
'Leased Products - Electronics': [2, 5, 8],
'Investments - Electronics': [3, 7, 4],
'QC Completed?': [True, True, False],
})
Any idea how to fix it?
答案1
得分: 1
使用enumerate
创建字典,并通过GroupBy.cumcount
将计数器中的重复值映射:
suffixes = ['Vehicles', 'Electronics', 'Real Estate']
d = dict(enumerate(suffixes))
s = original_df.columns.to_series()
new = s.str.replace(r'\.\d+$', '', regex=True)
mapped = (new.groupby(new).cumcount()
.where(new.duplicated(keep=False)).map(d)
.radd(' - ').fillna(''))
original_df.columns = new + mapped
print (original_df)
ID Revenue (USDm) Location Year Sold Products - Vehicles \
0 True 1000 London 2021 10
1 False 2000 New York 2022 20
2 True 1500 Paris 2023 30
Leased Products - Vehicles Investments - Vehicles \
0 5 7
1 10 12
2 15 8
Sold Products - Electronics Leased Products - Electronics \
0 15 8
1 25 12
2 35 16
Investments - Electronics Sold Products - Real Estate \
0 6 5
1 9 10
2 11 15
Leased Products - Real Estate Investments - Real Estate QC Completed?
0 2 3 True
1 5 7 True
2 8 4 False
英文:
Create dictionary by enumerate
and mapping duplicated values in counter by GroupBy.cumcount
:
suffixes = ['Vehicles','Electronics','Real Estate']
d = dict(enumerate(suffixes))
s = original_df.columns.to_series()
new = s.str.replace(r'\.\d+$','', regex=True)
mapped = (new.groupby(new).cumcount()
.where(new.duplicated(keep=False)).map(d)
.radd(' - ').fillna(''))
original_df.columns = new + mapped
print (original_df)
ID Revenue (USDm) Location Year Sold Products - Vehicles \
0 True 1000 London 2021 10
1 False 2000 New York 2022 20
2 True 1500 Paris 2023 30
Leased Products - Vehicles Investments - Vehicles \
0 5 7
1 10 12
2 15 8
Sold Products - Electronics Leased Products - Electronics \
0 15 8
1 25 12
2 35 16
Investments - Electronics Sold Products - Real Estate \
0 6 5
1 9 10
2 11 15
Leased Products - Real Estate Investments - Real Estate QC Completed?
0 2 3 True
1 5 7 True
2 8 4 False
答案2
得分: 1
你可以使用正则表达式和 str.replace
进行处理,这里还有一个用于灵活性的自定义函数:
import re
# 识别重复列
dup_cols = original_df.filter(regex=r'\.\d+$').columns
# 获取重复列的基本名称
base = dup_cols.str.replace(r'\.\d+$', '').unique()
# ['Sold Products', 'Leased Products', 'Investments']
# 创建模式
pattern = fr"^({'|'.join(map(re.escape, base))})(\.\d+)?$"
# '^(Sold\\ Products|Leased\\ Products|Investments)(\\.\\d+)?$'
suffixes = ['Vehicles', 'Electronics', 'Real Estate']
dic = dict(enumerate(suffixes))
def f(m):
suffix = m.group(2)
if suffix:
suffix = dic.get(int(suffix[1:]), '')
else:
suffix = dic[0]
return m.group(1) + ' - ' + suffix
# 基于模式进行替换
original_df.columns = original_df.columns.str.replace(pattern, f, regex=True)
输出:
ID Revenue (USDm) Location Year Sold Products - Vehicles Leased Products - Vehicles Investments - Vehicles Sold Products - Electronics Leased Products - Electronics \
0 True 1000 London 2021 10 5 7 15 8
1 False 2000 New York 2022 20 10 12 25 12
2 True 1500 Paris 2023 30 15 8 35 16
Investments - Electronics Sold Products - Real Estate Leased Products - Real Estate Investments - Real Estate QC Completed?
0 6 5 2 3 True
1 9 10 5 7 True
2 11 15 8 4 False
英文:
You could use a regex for that and str.replace
, here with a custom function for flexibility:
import re
# identify duplicated columns
dup_cols = original_df.filter(regex=r'\.\d+$').columns
# get the base names of the duplicates
base = dup_cols.str.replace(r'\.\d+$', '').unique()
# ['Sold Products', 'Leased Products', 'Investments']
# craft a pattern
pattern = fr"^({'|'.join(map(re.escape, base))})(\.\d+)?$"
# '^(Sold\\ Products|Leased\\ Products|Investments)(\\.\\d+)?$'
suffixes = ['Vehicles','Electronics','Real Estate']
dic = dict(enumerate(suffixes))
def f(m):
suffix = m.group(2)
if suffix:
suffix = dic.get(int(suffix[1:]), '')
else:
suffix = dic[0]
return m.group(1) + ' - ' + suffix
# replace based on pattern
original_df.columns = original_df.columns.str.replace(pattern, f, regex=True)
Output:
ID Revenue (USDm) Location Year Sold Products - Vehicles Leased Products - Vehicles Investments - Vehicles Sold Products - Electronics Leased Products - Electronics \
0 True 1000 London 2021 10 5 7 15 8
1 False 2000 New York 2022 20 10 12 25 12
2 True 1500 Paris 2023 30 15 8 35 16
Investments - Electronics Sold Products - Real Estate Leased Products - Real Estate Investments - Real Estate QC Completed?
0 6 5 2 3 True
1 9 10 5 7 True
2 11 15 8 4 False
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论