英文:
Dropping data from Dataframe by condition of other dataframe
问题
我已经写了一个用两个for循环的代码来解决这个问题,但它执行起来太慢了:
import pandas as pd
import numpy as np
import time
import datetime
d = {'ts': [1677585630000, 1677612900000, 1677589319500, 1677614000000, 1677613900000],
'value': [25.98, 81.31, 39.54, 12.34, 23.87],
'outlet_id': [10,10,21,21,10]}
df = pd.DataFrame(data=d)
excelPath = "./Stackoverflow/runningtimes.xlsx"
excel_dfs = []
excel_dfs_index = []
dropped = 0
# examples // Original data comes from an excel sheet
d10 = {'outlet_id': [10, 10, 10],
'Start Run': ['28.02.2023 13:00:00', '28.02.2023 14:00:00', '28.02.2023 20:30:00'],
'End Run': ['28.02.2023 13:00:40', '28.02.2023 14:00:19', '28.02.2023 20:46:40']}
d21 = {'outlet_id': [21, 21, 21],
'Start Run': ['28.02.2023 13:00:40', '28.02.2023 14:01:59', '28.02.2023 20:46:40'],
'End Run': ['28.02.2023 13:00:50', '28.02.2023 14:02:09', '28.02.2023 20:51:40']}
df10 = pd.DataFrame(data=d10)
df21 = pd.DataFrame(data=d21)
print("DF Length before: " + str(len(df.index))
for rowIndex, row in df.iterrows():
timestamp = row['ts']
outlet_id = int(row['outlet_id'])
try:
if not outlet_id in excel_dfs_index:
# excel_dfs.append(pd.read_excel(excelPath, sheet_name=str(outlet_id)))
if outlet_id == 10:
excel_dfs.append(df10)
elif outlet_id == 21:
excel_dfs.append(df21)
excel_dfs_index.append(outlet_id)
localdf = excel_dfs[excel_dfs_index.index(outlet_id)]
wasRunning = False
for indexEX, rowEX in localdf.iterrows():
startRunTS = time.mktime(datetime.datetime.strptime(str(rowEX['Start Run']), "%Y-%m-%d %H:%M:%S").timetuple()) * 1000
endRunTS = time.mktime(datetime.datetime.strptime(str(rowEX['End Run']), "%Y-%m-%d %H:%M:%S").timetuple()) * 1000
if (float(startRunTS) <= float(timestamp) <= float(endRunTS)):
wasRunning = True
break
if wasRunning == False:
df = df.drop(index=rowIndex, axis='rows')
dropped += 1
except:
if not outlet_id in excel_dfs_index:
print("outlet not found in excel file")
excel_dfs.append(pd.read_excel(excelPath, sheet_name=str(outlet_id)))
excel_dfs_index.append(outlet_id)
print("DF Length after: " + str(len(df.index))
print("Dropped: " + str(dropped))
print(df)
有人有更高效的解决方案吗?
以下是更高效的解决方案:
import pandas as pd
# Sample dataframes
d = {'ts': [1677585630000, 1677612900000, 1677589319500, 1677614000000, 1677613900000],
'value': [25.98, 81.31, 39.54, 12.34, 23.87],
'outlet_id': [10, 10, 21, 21, 10]}
df = pd.DataFrame(data=d)
d10 = {'outlet_id': [10, 10, 10],
'Start Run': ['28.02.2023 13:00:00', '28.02.2023 14:00:00', '28.02.2023 20:30:00'],
'End Run': ['28.02.2023 13:00:40', '28.02.2023 14:00:19', '28.02.2023 20:46:40']}
d21 = {'outlet_id': [21, 21, 21],
'Start Run': ['28.02.2023 13:00:40', '28.02.2023 14:01:59', '28.02.2023 20:46:40'],
'End Run': ['28.02.2023 13:00:50', '28.02.2023 14:02:09', '28.02.2023 20:51:40']}
df10 = pd.DataFrame(data=d10)
df21 = pd.DataFrame(data=d21)
# Combine the outlet dataframes into a dictionary for quick access
outlet_dfs = {10: df10, 21: df21}
# Convert timestamps in the outlet dataframes to datetime objects
for outlet_id, outlet_df in outlet_dfs.items():
outlet_df['Start Run'] = pd.to_datetime(outlet_df['Start Run'], format='%d.%m.%Y %H:%M:%S')
outlet_df['End Run'] = pd.to_datetime(outlet_df['End Run'], format='%d.%m.%Y %H:%M:%S')
# Convert the 'ts' column in the main dataframe to datetime objects
df['ts'] = pd.to_datetime(df['ts'], unit='ms')
# Initialize a boolean mask for filtering
mask = []
# Iterate through rows of the main dataframe
for _, row in df.iterrows():
outlet_id = row['outlet_id']
timestamp = row['ts']
outlet_df = outlet_dfs.get(outlet_id)
if outlet_df is not None:
# Check if the timestamp is within any time interval in the outlet dataframe
mask.append(any((row['Start Run'] <= timestamp <= row['End Run']) for _, row in outlet_df.iterrows()))
else:
# If outlet data is not available, keep the row
mask.append(True)
# Apply the boolean mask to filter the rows
filtered_df = df[mask]
print("Filtered DataFrame:")
print(filtered_df)
这个解决方案使用了Pandas的向量化操作,将主数据框和出口数据框中的时间
英文:
I have two pandas dataframes with timestamps. I want to drop dataframe rows if the timestamp of each is not between the start and end timestamp of the second dataframe. But I receive the dataframe from an excel worksheet.
e.g. first dataframe:
| no | timestamp | Value | outlet |
| -------- | -------- | -------- |
| 1 | 1677585630000 | 25.98 | 10 |
| 2 | 1677612900000 | 81.31 | 10 |
| 3 | 1677589319500 | 39.54 | 21 |
| 4 | 1677614000000 | 12.34 | 21 |
| 5 | 1677613900000 | 23.87 | 10 |
e.g. Excel worksheet for outlet 10 (as a own dataframe):
no | Start Run | End Run |
---|---|---|
1 | 28.02.2023 13:00:00 | 28.02.2023 13:00:40 |
2 | 28.02.2023 14:00:00 | 28.02.2023 14:00:19 |
3 | 28.02.2023 20:30:00 | 28.02.2023 20:46:40 |
So in this case the result should be:
| no | timestamp | Value | outlet |
| -------- | -------- | -------- |
| 1 | 1677585630000 | 25.98 | 10 |
| 2 | 1677612900000 | 23.87 | 10 |
| 3 | 1677589319500 | 39.54 | 21 |
I already wrote a code to solve the problem, but I did it with two for-loops and it tooks way to long to execute it efficiently:
import pandas as pd
import numpy as np
import time
import datetime
d = {'ts': [1677585630000, 1677612900000, 1677589319500, 1677614000000, 1677613900000],
'value': [25.98, 81.31, 39.54, 12.34, 23.87],
'outlet_id': [10,10,21,21,10]}
df = pd.DataFrame(data=d)
excelPath = "./Stackoverflow/runningtimes.xlsx"
excel_dfs = []
excel_dfs_index = []
dropped = 0
# examples // Original data comes from an excel sheet
d10 = {'outlet_id': [10, 10, 10],
'Start Run': ['28.02.2023 13:00:00', '28.02.2023 14:00:00', '28.02.2023 20:30:00'],
'End Run': ['28.02.2023 13:00:40', '28.02.2023 14:00:19', '28.02.2023 20:46:40']}
d21 = {'outlet_id': [21, 21, 21],
'Start Run': ['28.02.2023 13:00:40', '28.02.2023 14:01:59', '28.02.2023 20:46:40'],
'End Run': ['28.02.2023 13:00:50', '28.02.2023 14:02:09', '28.02.2023 20:51:40']}
df10 = pd.DataFrame(data=d10)
df21 = pd.DataFrame(data=d21)
print("DF Length before: " + str(len(df.index)))
for rowIndex, row in df.iterrows():
timestamp = row['ts']
outlet_id = int(row['outlet_id'])
try:
if not outlet_id in excel_dfs_index:
# excel_dfs.append(pd.read_excel(excelPath, sheet_name=str(outlet_id)))
if outlet_id == 10:
excel_dfs.append(df10)
elif outlet_id == 21:
excel_dfs.append(df21)
excel_dfs_index.append(outlet_id)
localdf = excel_dfs[excel_dfs_index.index(outlet_id)]
wasRunning = False
for indexEX, rowEX in localdf.iterrows():
startRunTS = time.mktime(datetime.datetime.strptime(str(rowEX['Start Run']), "%Y-%m-%d %H:%M:%S").timetuple()) * 1000
endRunTS = time.mktime(datetime.datetime.strptime(str(rowEX['End Run']), "%Y-%m-%d %H:%M:%S").timetuple()) * 1000
if (float(startRunTS) <= float(timestamp) <= float(endRunTS)):
wasRunning = True
break
if wasRunning == False:
df = df.drop(index=rowIndex, axis='rows')
dropped += 1
except:
if not outlet_id in excel_dfs_index:
print("outlet not found in excel file")
excel_dfs.append(pd.read_excel(excelPath, sheet_name=str(outlet_id)))
excel_dfs_index.append(outlet_id)
print("DF Length after: " + str(len(df.index)))
print("Dropped: " + str(dropped))
print (df)
Has anyone a solution, that would be more efficient?
答案1
得分: 0
首先,您应确保您的时间戳保持一致(同时我编辑了df10
的第一行,否则输出的数据框将为空)。然后,如果您构建一个用于查找您的excel数据框的字典,您可以使用我之前提供的函数进行轻微修改:
import pandas as pd
import numpy as np
d = {'ts': [1677585630000, 1677612900000, 1677589319500, 1677614000000, 1677613900000],
'value': [25.98, 81.31, 39.54, 12.34, 23.87],
'outlet_id': [10,10,21,21,10]}
df = pd.DataFrame(data=d)
df['ts'] = pd.to_datetime(df['ts'], unit='ms')
# 例子 // 原始数据来自Excel表格
d10 = {'outlet_id': [10, 10, 10],
'Start Run': ['28.02.2023 12:00:00', '28.02.2023 14:00:00', '28.02.2023 20:30:00'],
'End Run': ['28.02.2023 12:00:40', '28.02.2023 14:00:19', '28.02.2023 20:46:40']}
d21 = {'outlet_id': [21, 21, 21],
'Start Run': ['28.02.2023 13:00:40', '28.02.2023 14:01:59', '28.02.2023 20:46:40'],
'End Run': ['28.02.2023 13:00:50', '28.02.2023 14:02:09', '28.02.2023 20:51:40']}
df10 = pd.DataFrame(data=d10)
df10[['Start Run', 'End Run']] = df10[['Start Run', 'End Run']].apply(pd.to_datetime)
df21 = pd.DataFrame(data=d21)
df21[['Start Run', 'End Run']] = df21[['Start Run', 'End Run']].apply(pd.to_datetime)
excel_dfs = {10: df10, 21: df21} # 用于查找您的Excel数据框的查找字典
# 在您的实际情况下,您可以使用类似以下的方式构建它:
# {outlet_id: pd.read_excel(excelPath, sheet_name=str(outlet_id)) for outlet_id in df['outlet_id'].unique()}
print("DF在之前的长度: " + str(len(df.index)))
def is_between_start_and_end(row_df):
ts = row_df['ts']
df_xls = excel_dfs[row_df['outlet_id']]
return any(row['Start Run'] <= ts <= row['End Run'] for _, row in df_xls.iterrows())
df = df[df.apply(is_between_start_and_end, axis=1)]
print("DF在之后的长度: " + str(len(df.index))
print(df)
输出:
ts value outlet_id
0 2023-02-28 12:00:30 25.98 10
英文:
Edit: adapting example after OP's edit
First you should make sure your timestamps are consistent (also I edited the first row of df10
, otherwise the output df would be empty). Then if you build a dictionary to lookup your excel_dfs, you can use the funtion I previously submitted with a slight modification:
import pandas as pd
import numpy as np
d = {'ts': [1677585630000, 1677612900000, 1677589319500, 1677614000000, 1677613900000],
'value': [25.98, 81.31, 39.54, 12.34, 23.87],
'outlet_id': [10,10,21,21,10]}
df = pd.DataFrame(data=d)
df['ts'] = pd.to_datetime(df['ts'], unit='ms')
# examples // Original data comes from an excel sheet
d10 = {'outlet_id': [10, 10, 10],
'Start Run': ['28.02.2023 12:00:00', '28.02.2023 14:00:00', '28.02.2023 20:30:00'],
'End Run': ['28.02.2023 12:00:40', '28.02.2023 14:00:19', '28.02.2023 20:46:40']}
d21 = {'outlet_id': [21, 21, 21],
'Start Run': ['28.02.2023 13:00:40', '28.02.2023 14:01:59', '28.02.2023 20:46:40'],
'End Run': ['28.02.2023 13:00:50', '28.02.2023 14:02:09', '28.02.2023 20:51:40']}
df10 = pd.DataFrame(data=d10)
df10[['Start Run', 'End Run']] = df10[['Start Run', 'End Run']].apply(pd.to_datetime)
df21 = pd.DataFrame(data=d21)
df21[['Start Run', 'End Run']] = df21[['Start Run', 'End Run']].apply(pd.to_datetime)
excel_dfs = {10: df10, 21: df21} # lookup dictionary for your excel dfs
# in your real case you'd build it with sth like:
# {outlet_id: pd.read_excel(excelPath, sheet_name=str(outlet_id)) for outlet_id in df['outlet_id'].unique()}
print("DF Length before: " + str(len(df.index)))
def is_between_start_and_end(row_df):
ts = row_df['ts']
df_xls = excel_dfs[row_df['outlet_id']]
return any(row['Start Run'] <= ts <= row['End Run'] for _, row in df_xls.iterrows())
df = df[df.apply(is_between_start_and_end, axis=1)]
print("DF Length after: " + str(len(df.index)))
print(df)
Output:
ts value outlet_id
0 2023-02-28 12:00:30 25.98 10
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论