从另一个数据框的条件下删除数据

huangapple go评论75阅读模式
英文:

Dropping data from Dataframe by condition of other dataframe

问题

我已经写了一个用两个for循环的代码来解决这个问题,但它执行起来太慢了:

import pandas as pd
import numpy as np
import time
import datetime

d = {'ts': [1677585630000, 1677612900000, 1677589319500, 1677614000000, 1677613900000],
    'value': [25.98, 81.31, 39.54, 12.34, 23.87],
    'outlet_id': [10,10,21,21,10]}
df = pd.DataFrame(data=d)

excelPath = "./Stackoverflow/runningtimes.xlsx"

excel_dfs = []
excel_dfs_index = []

dropped = 0

# examples // Original data comes from an excel sheet
d10 = {'outlet_id': [10, 10, 10],
        'Start Run': ['28.02.2023  13:00:00', '28.02.2023  14:00:00', '28.02.2023  20:30:00'],
        'End Run': ['28.02.2023  13:00:40', '28.02.2023  14:00:19', '28.02.2023  20:46:40']}

d21 = {'outlet_id': [21, 21, 21],
        'Start Run': ['28.02.2023  13:00:40', '28.02.2023  14:01:59', '28.02.2023  20:46:40'],
        'End Run': ['28.02.2023  13:00:50', '28.02.2023  14:02:09', '28.02.2023  20:51:40']}

df10 = pd.DataFrame(data=d10)
df21 = pd.DataFrame(data=d21)

print("DF Length before: " + str(len(df.index))

for rowIndex, row in df.iterrows():

    timestamp = row['ts']
    outlet_id = int(row['outlet_id'])

    try:
        if not outlet_id in excel_dfs_index:
            # excel_dfs.append(pd.read_excel(excelPath, sheet_name=str(outlet_id)))
            if outlet_id == 10:
                excel_dfs.append(df10)
            elif outlet_id == 21:
                excel_dfs.append(df21)
            excel_dfs_index.append(outlet_id)

        localdf = excel_dfs[excel_dfs_index.index(outlet_id)]

        wasRunning = False

        for indexEX, rowEX in localdf.iterrows():
            
            startRunTS = time.mktime(datetime.datetime.strptime(str(rowEX['Start Run']), "%Y-%m-%d %H:%M:%S").timetuple()) * 1000
            endRunTS = time.mktime(datetime.datetime.strptime(str(rowEX['End Run']), "%Y-%m-%d %H:%M:%S").timetuple()) * 1000
                
            if (float(startRunTS) <= float(timestamp) <= float(endRunTS)):
                wasRunning = True
                break

        if wasRunning == False:
            df = df.drop(index=rowIndex, axis='rows')
            dropped += 1

    except:
        if not outlet_id in excel_dfs_index:
            print("outlet not found in excel file")
            excel_dfs.append(pd.read_excel(excelPath, sheet_name=str(outlet_id)))
            excel_dfs_index.append(outlet_id)

print("DF Length after: " + str(len(df.index))
print("Dropped: " + str(dropped))

print(df)

有人有更高效的解决方案吗?

以下是更高效的解决方案:

import pandas as pd

# Sample dataframes
d = {'ts': [1677585630000, 1677612900000, 1677589319500, 1677614000000, 1677613900000],
    'value': [25.98, 81.31, 39.54, 12.34, 23.87],
    'outlet_id': [10, 10, 21, 21, 10]}
df = pd.DataFrame(data=d)

d10 = {'outlet_id': [10, 10, 10],
        'Start Run': ['28.02.2023  13:00:00', '28.02.2023  14:00:00', '28.02.2023  20:30:00'],
        'End Run': ['28.02.2023  13:00:40', '28.02.2023  14:00:19', '28.02.2023  20:46:40']}
d21 = {'outlet_id': [21, 21, 21],
        'Start Run': ['28.02.2023  13:00:40', '28.02.2023  14:01:59', '28.02.2023  20:46:40'],
        'End Run': ['28.02.2023  13:00:50', '28.02.2023  14:02:09', '28.02.2023  20:51:40']}
df10 = pd.DataFrame(data=d10)
df21 = pd.DataFrame(data=d21)

# Combine the outlet dataframes into a dictionary for quick access
outlet_dfs = {10: df10, 21: df21}

# Convert timestamps in the outlet dataframes to datetime objects
for outlet_id, outlet_df in outlet_dfs.items():
    outlet_df['Start Run'] = pd.to_datetime(outlet_df['Start Run'], format='%d.%m.%Y %H:%M:%S')
    outlet_df['End Run'] = pd.to_datetime(outlet_df['End Run'], format='%d.%m.%Y %H:%M:%S')

# Convert the 'ts' column in the main dataframe to datetime objects
df['ts'] = pd.to_datetime(df['ts'], unit='ms')

# Initialize a boolean mask for filtering
mask = []

# Iterate through rows of the main dataframe
for _, row in df.iterrows():
    outlet_id = row['outlet_id']
    timestamp = row['ts']
    outlet_df = outlet_dfs.get(outlet_id)

    if outlet_df is not None:
        # Check if the timestamp is within any time interval in the outlet dataframe
        mask.append(any((row['Start Run'] <= timestamp <= row['End Run']) for _, row in outlet_df.iterrows()))
    else:
        # If outlet data is not available, keep the row
        mask.append(True)

# Apply the boolean mask to filter the rows
filtered_df = df[mask]

print("Filtered DataFrame:")
print(filtered_df)

这个解决方案使用了Pandas的向量化操作,将主数据框和出口数据框中的时间

英文:

I have two pandas dataframes with timestamps. I want to drop dataframe rows if the timestamp of each is not between the start and end timestamp of the second dataframe. But I receive the dataframe from an excel worksheet.

e.g. first dataframe:

| no | timestamp | Value | outlet |
| -------- | -------- | -------- |
| 1 | 1677585630000 | 25.98 | 10 |
| 2 | 1677612900000 | 81.31 | 10 |
| 3 | 1677589319500 | 39.54 | 21 |
| 4 | 1677614000000 | 12.34 | 21 |
| 5 | 1677613900000 | 23.87 | 10 |

e.g. Excel worksheet for outlet 10 (as a own dataframe):

no Start Run End Run
1 28.02.2023 13:00:00 28.02.2023 13:00:40
2 28.02.2023 14:00:00 28.02.2023 14:00:19
3 28.02.2023 20:30:00 28.02.2023 20:46:40

So in this case the result should be:

| no | timestamp | Value | outlet |
| -------- | -------- | -------- |
| 1 | 1677585630000 | 25.98 | 10 |
| 2 | 1677612900000 | 23.87 | 10 |
| 3 | 1677589319500 | 39.54 | 21 |

I already wrote a code to solve the problem, but I did it with two for-loops and it tooks way to long to execute it efficiently:

import pandas as pd
import numpy as np
import time
import datetime
d = {&#39;ts&#39;: [1677585630000, 1677612900000, 1677589319500, 1677614000000, 1677613900000],
&#39;value&#39;: [25.98, 81.31, 39.54, 12.34, 23.87],
&#39;outlet_id&#39;: [10,10,21,21,10]}
df = pd.DataFrame(data=d)
excelPath = &quot;./Stackoverflow/runningtimes.xlsx&quot;
excel_dfs = []
excel_dfs_index = []
dropped = 0
# examples // Original data comes from an excel sheet
d10 = {&#39;outlet_id&#39;: [10, 10, 10],
&#39;Start Run&#39;: [&#39;28.02.2023  13:00:00&#39;, &#39;28.02.2023  14:00:00&#39;, &#39;28.02.2023  20:30:00&#39;],
&#39;End Run&#39;: [&#39;28.02.2023  13:00:40&#39;, &#39;28.02.2023  14:00:19&#39;, &#39;28.02.2023  20:46:40&#39;]}
d21 = {&#39;outlet_id&#39;: [21, 21, 21],
&#39;Start Run&#39;: [&#39;28.02.2023  13:00:40&#39;, &#39;28.02.2023  14:01:59&#39;, &#39;28.02.2023  20:46:40&#39;],
&#39;End Run&#39;: [&#39;28.02.2023  13:00:50&#39;, &#39;28.02.2023  14:02:09&#39;, &#39;28.02.2023  20:51:40&#39;]}
df10 = pd.DataFrame(data=d10)
df21 = pd.DataFrame(data=d21)
print(&quot;DF Length before: &quot; + str(len(df.index)))
for rowIndex, row in df.iterrows():
timestamp = row[&#39;ts&#39;]
outlet_id = int(row[&#39;outlet_id&#39;])
try:
if not outlet_id in excel_dfs_index:
# excel_dfs.append(pd.read_excel(excelPath, sheet_name=str(outlet_id)))
if outlet_id == 10:
excel_dfs.append(df10)
elif outlet_id == 21:
excel_dfs.append(df21)
excel_dfs_index.append(outlet_id)
localdf = excel_dfs[excel_dfs_index.index(outlet_id)]
wasRunning = False
for indexEX, rowEX in localdf.iterrows():
startRunTS = time.mktime(datetime.datetime.strptime(str(rowEX[&#39;Start Run&#39;]), &quot;%Y-%m-%d %H:%M:%S&quot;).timetuple()) * 1000
endRunTS = time.mktime(datetime.datetime.strptime(str(rowEX[&#39;End Run&#39;]), &quot;%Y-%m-%d %H:%M:%S&quot;).timetuple()) * 1000
if (float(startRunTS) &lt;= float(timestamp) &lt;= float(endRunTS)):
wasRunning = True
break
if wasRunning == False:
df = df.drop(index=rowIndex, axis=&#39;rows&#39;)
dropped += 1
except:
if not outlet_id in excel_dfs_index:
print(&quot;outlet not found in excel file&quot;)
excel_dfs.append(pd.read_excel(excelPath, sheet_name=str(outlet_id)))
excel_dfs_index.append(outlet_id)
print(&quot;DF Length after: &quot; + str(len(df.index)))
print(&quot;Dropped: &quot; + str(dropped))
print (df)

Has anyone a solution, that would be more efficient?

答案1

得分: 0

首先,您应确保您的时间戳保持一致(同时我编辑了df10的第一行,否则输出的数据框将为空)。然后,如果您构建一个用于查找您的excel数据框的字典,您可以使用我之前提供的函数进行轻微修改:

import pandas as pd
import numpy as np

d = {'ts': [1677585630000, 1677612900000, 1677589319500, 1677614000000, 1677613900000],
    'value': [25.98, 81.31, 39.54, 12.34, 23.87],
    'outlet_id': [10,10,21,21,10]}
df = pd.DataFrame(data=d)
df['ts'] = pd.to_datetime(df['ts'], unit='ms')

# 例子 // 原始数据来自Excel表格
d10 = {'outlet_id': [10, 10, 10],
        'Start Run': ['28.02.2023  12:00:00', '28.02.2023  14:00:00', '28.02.2023  20:30:00'],
        'End Run': ['28.02.2023  12:00:40', '28.02.2023  14:00:19', '28.02.2023  20:46:40']}

d21 = {'outlet_id': [21, 21, 21],
        'Start Run': ['28.02.2023  13:00:40', '28.02.2023  14:01:59', '28.02.2023  20:46:40'],
        'End Run': ['28.02.2023  13:00:50', '28.02.2023  14:02:09', '28.02.2023  20:51:40']}

df10 = pd.DataFrame(data=d10)
df10[['Start Run', 'End Run']] = df10[['Start Run', 'End Run']].apply(pd.to_datetime)
df21 = pd.DataFrame(data=d21)
df21[['Start Run', 'End Run']] = df21[['Start Run', 'End Run']].apply(pd.to_datetime)

excel_dfs = {10: df10, 21: df21}    # 用于查找您的Excel数据框的查找字典
# 在您的实际情况下,您可以使用类似以下的方式构建它:
# {outlet_id: pd.read_excel(excelPath, sheet_name=str(outlet_id)) for outlet_id in df['outlet_id'].unique()}

print("DF在之前的长度: " + str(len(df.index)))

def is_between_start_and_end(row_df):
    ts = row_df['ts']
    df_xls = excel_dfs[row_df['outlet_id']]
    return any(row['Start Run'] <= ts <= row['End Run'] for _, row in df_xls.iterrows())

df = df[df.apply(is_between_start_and_end, axis=1)]

print("DF在之后的长度: " + str(len(df.index))
print(df)

输出:

                   ts  value  outlet_id
0 2023-02-28 12:00:30  25.98         10
英文:

Edit: adapting example after OP's edit

First you should make sure your timestamps are consistent (also I edited the first row of df10, otherwise the output df would be empty). Then if you build a dictionary to lookup your excel_dfs, you can use the funtion I previously submitted with a slight modification:

import pandas as pd
import numpy as np
d = {&#39;ts&#39;: [1677585630000, 1677612900000, 1677589319500, 1677614000000, 1677613900000],
&#39;value&#39;: [25.98, 81.31, 39.54, 12.34, 23.87],
&#39;outlet_id&#39;: [10,10,21,21,10]}
df = pd.DataFrame(data=d)
df[&#39;ts&#39;] = pd.to_datetime(df[&#39;ts&#39;], unit=&#39;ms&#39;)
# examples // Original data comes from an excel sheet
d10 = {&#39;outlet_id&#39;: [10, 10, 10],
&#39;Start Run&#39;: [&#39;28.02.2023  12:00:00&#39;, &#39;28.02.2023  14:00:00&#39;, &#39;28.02.2023  20:30:00&#39;],
&#39;End Run&#39;: [&#39;28.02.2023  12:00:40&#39;, &#39;28.02.2023  14:00:19&#39;, &#39;28.02.2023  20:46:40&#39;]}
d21 = {&#39;outlet_id&#39;: [21, 21, 21],
&#39;Start Run&#39;: [&#39;28.02.2023  13:00:40&#39;, &#39;28.02.2023  14:01:59&#39;, &#39;28.02.2023  20:46:40&#39;],
&#39;End Run&#39;: [&#39;28.02.2023  13:00:50&#39;, &#39;28.02.2023  14:02:09&#39;, &#39;28.02.2023  20:51:40&#39;]}
df10 = pd.DataFrame(data=d10)
df10[[&#39;Start Run&#39;, &#39;End Run&#39;]] = df10[[&#39;Start Run&#39;, &#39;End Run&#39;]].apply(pd.to_datetime)
df21 = pd.DataFrame(data=d21)
df21[[&#39;Start Run&#39;, &#39;End Run&#39;]] = df21[[&#39;Start Run&#39;, &#39;End Run&#39;]].apply(pd.to_datetime)
excel_dfs = {10: df10, 21: df21}    # lookup dictionary for your excel dfs
# in your real case you&#39;d build it with sth like:
# {outlet_id: pd.read_excel(excelPath, sheet_name=str(outlet_id)) for outlet_id in df[&#39;outlet_id&#39;].unique()}
print(&quot;DF Length before: &quot; + str(len(df.index)))
def is_between_start_and_end(row_df):
ts = row_df[&#39;ts&#39;]
df_xls = excel_dfs[row_df[&#39;outlet_id&#39;]]
return any(row[&#39;Start Run&#39;] &lt;= ts &lt;= row[&#39;End Run&#39;] for _, row in df_xls.iterrows())
df = df[df.apply(is_between_start_and_end, axis=1)]
print(&quot;DF Length after: &quot; + str(len(df.index)))
print(df)

Output:

                   ts  value  outlet_id
0 2023-02-28 12:00:30  25.98         10

huangapple
  • 本文由 发表于 2023年2月27日 19:09:34
  • 转载请务必保留本文链接:https://go.coder-hub.com/75579694.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定