英文:
Fill gaps in time intervals with other time intervals
问题
我们有两个带有时间间隔的表格。我想要填补df1中的间隙,使用df2,就像图中所示得到df3。df1保持不变,只有df2中与df1间隙(差异)重叠的部分被移到df3中。
import pandas as pd
df1 = pd.DataFrame({'Start': ['2023-01-01', '2023-02-01', '2023-03-15', '2023-04-18', '2023-05-15', '2023-05-25'],
'End': ['2023-01-15', '2023-02-20', '2023-04-01', '2023-05-03', '2023-05-20', '2023-05-30']})
df2 = pd.DataFrame({'Start': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05', '2023-04-18', '2023-05-12'],
'End': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15', '2023-05-10', '2023-06-05']})
df3 = pd.DataFrame({'Start': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02', '2023-04-18', '2023-05-04', '2023-05-12', '2023-05-15', '2023-05-21', '2023-05-25', '2023-05-31'],
'End': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15', '2023-05-03', '2023-05-10', '2023-05-14', '2023-05-20', '2023-05-24', '2023-05-30', '2023-06-05']})
# 你的代码部分
import plotly.express as px
df_plot = pd.concat(
[
df1.assign(color='df1', df='df1'),
df2.assign(color='df2', df='df2'),
df3.assign(color=['df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2'], df='df3')
],
)
fig = px.timeline(df_plot, x_start="Start", x_end="End", y="df", color="color")
fig.update_yaxes(categoryorder='category descending')
fig.show()
英文:
We have two tables with time intervals. I want to fill gaps in df1 with df2 as in the graph to get df3. df1 is moved to df3 as it is, and only the parts of df2 that lie in the gaps of df1 (difference) are moved to df3.
df1 = pd.DataFrame({'Start': ['2023-01-01', '2023-02-01', '2023-03-15', '2023-04-18', '2023-05-15', '2023-05-25'],
'End': ['2023-01-15', '2023-02-20', '2023-04-01', '2023-05-03', '2023-05-20', '2023-05-30']})
df2 = pd.DataFrame({'Start': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05', '2023-04-18', '2023-05-12'],
'End': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15', '2023-05-10', '2023-06-05']})
df3 = pd.DataFrame({'Start': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02', '2023-04-18', '2023-05-04', '2023-05-12', '2023-05-15', '2023-05-21', '2023-05-25', '2023-05-31'],
'End': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15', '2023-05-03', '2023-05-10', '2023-05-14', '2023-05-20', '2023-05-24', '2023-05-30', '2023-06-05']})
# df1
Start End
0 2023-01-01 2023-01-15
1 2023-02-01 2023-02-20
2 2023-03-15 2023-04-01
3 2023-04-18 2023-05-03
4 2023-05-15 2023-05-20
5 2023-05-25 2023-05-30
# df2
Start End
0 2023-01-02 2023-01-03
1 2023-01-05 2023-01-10
2 2023-01-20 2023-02-10
3 2023-02-25 2023-03-01
4 2023-03-05 2023-04-15
5 2023-04-18 2023-05-10
6 2023-05-12 2023-06-05
# df3 (desired result)
Start End
0 2023-01-01 2023-01-15
1 2023-01-20 2023-01-31
2 2023-02-01 2023-02-20
3 2023-02-25 2023-03-01
4 2023-03-05 2023-03-14
5 2023-03-15 2023-04-01
6 2023-04-02 2023-04-15
7 2023-04-18 2023-05-03
8 2023-05-04 2023-05-10
9 2023-05-12 2023-05-14
10 2023-05-15 2023-05-20
11 2023-05-21 2023-05-24
12 2023-05-25 2023-05-30
13 2023-05-31 2023-06-05
Code to generate plot:
import plotly.express as px
df_plot = pd.concat(
[
df1.assign(color='df1', df='df1'),
df2.assign(color='df2', df='df2'),
df3.assign(color=['df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2'], df='df3')
],
)
fig = px.timeline(df_plot, x_start="Start", x_end="End", y="df", color="color")
fig.update_yaxes(categoryorder='category descending')
fig.show()
答案1
得分: 3
我认为我可以让你接近:
df1 = pd.DataFrame({'开始': ['2023-01-01', '2023-02-01', '2023-03-15'],
'结束': ['2023-01-15', '2023-02-20', '2023-04-01']})
df2 = pd.DataFrame({'开始': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05'],
'结束': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15']})
df3 = pd.DataFrame({'开始': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02'],
'结束': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15']})
df1['日期'] = [pd.date_range(s,e) for s, e in zip(df1['开始'], df1['结束'])]
df2['日期'] = [pd.date_range(s,e) for s, e in zip(df2['开始'], df2['结束'])]
df1e = df1.explode('日期').assign(来源='df1')
df2e = df2.explode('日期').assign(来源='df2')
df3e = df1e.set_index(df1e['日期']).combine_first(df2e.set_index(df2e['日期']))
df3e['日期'] = pd.to_datetime(df3e['日期'])
df3e['分组'] = ((df3e['来源'] != df3e['来源'].shift()) |
(df3e['日期'] - df3e['日期'].shift() > pd.Timedelta(days=1))).cumsum()
df_out = df3e.groupby(['分组', '来源'])['日期'].agg([min, max])
英文:
I think I can get you close:
df1 = pd.DataFrame({'Start': ['2023-01-01', '2023-02-01', '2023-03-15'],
'End': ['2023-01-15', '2023-02-20', '2023-04-01']})
df2 = pd.DataFrame({'Start': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05'],
'End': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15']})
df3 = pd.DataFrame({'Start': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02'],
'End': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15']})
df1['dates'] = [pd.date_range(s,e) for s, e in zip(df1['Start'], df1['End'])]
df2['dates'] = [pd.date_range(s,e) for s, e in zip(df2['Start'], df2['End'])]
df1e = df1.explode('dates').assign(source='df1')
df2e = df2.explode('dates').assign(source='df2')
df3e = df1e.set_index(df1e['dates']).combine_first(df2e.set_index(df2e['dates']))
df3e['dates'] = pd.to_datetime(df3e['dates'])
df3e['group'] = ((df3e['source'] != df3e['source'].shift()) |
(df3e['dates'] - df3e['dates'].shift() > pd.Timedelta(days=1))).cumsum()
df_out = df3e.groupby(['group', 'source'])['dates'].agg([min, max])
Output:
min max
group source
1 df1 2023-01-01 2023-01-15
2 df2 2023-01-20 2023-01-31
3 df1 2023-02-01 2023-02-20
4 df2 2023-02-25 2023-03-01
5 df2 2023-03-05 2023-03-14
6 df1 2023-03-15 2023-04-01
7 df2 2023-04-02 2023-04-15
Graphical Output:
import plotly.express as px
df_out = df_out.reset_index().rename({'source':'color', 'min':'Start', 'max':'End'}, axis=1)
df_plot = pd.concat(
[
df1.assign(color='df1'),
df2.assign(color='df2'),
df_out
],
keys=['df1' , 'df2', 'df3']
).reset_index(level=0, names='df')
fig = px.timeline(df_plot, x_start="Start", x_end="End", y="df", color="color")
fig.update_yaxes(categoryorder='category descending')
fig.show()
Graph:
with updated dataset:
min max
group source
1 df1 2023-01-01 2023-01-15
2 df2 2023-01-20 2023-01-31
3 df1 2023-02-01 2023-02-20
4 df2 2023-02-25 2023-03-01
5 df2 2023-03-05 2023-03-14
6 df1 2023-03-15 2023-04-01
7 df2 2023-04-02 2023-04-15
8 df1 2023-04-18 2023-05-03
9 df2 2023-05-04 2023-05-10
10 df2 2023-05-12 2023-05-14
11 df1 2023-05-15 2023-05-20
12 df2 2023-05-21 2023-05-24
13 df1 2023-05-25 2023-05-30
14 df2 2023-05-31 2023-06-05
Graph output:
答案2
得分: 2
以下是代码部分的翻译:
tmp = (pd.concat([df1, df2], keys=['ref', 'other']).rename_axis(index=['origin', 'index'])
.sort_values(by=['Start', 'End']).astype('datetime64').reset_index()
.assign(ref_next_start=lambda d: d['Start'].where(d['origin']=='ref').bfill(),
ref_prev_end=lambda d: d['End'].where(d['origin']=='ref').ffill(),
other_prev_end=lambda d: d['End'].where(d['origin']=='other').ffill(),
)
)
m = tmp['origin'].eq('ref')
out = pd.concat(
[(tmp.loc[tmp['End'].ge(tmp['ref_prev_end'])]
.assign(End=lambda d: d['End'].mask(d['origin'].shift(-1).eq('ref'),
d['ref_next_start']-pd.Timedelta("1d")))
)[['origin', 'Start', 'End']],
(tmp.loc[m & tmp['End'].le(tmp['other_prev_end']), ['End', 'other_prev_end']]
.rename(columns={'End': 'Start', 'other_prev_end': 'End'})
.assign(Start=lambda d: d['Start']+pd.Timedelta("1d"), origin='other')
)]
).sort_values(by=['Start', 'End'], ignore_index=True)
tmp = pd.concat([df1, df2], keys=['df1', 'df2'], names=['origin']).reset_index(level=0)
tmp['Start'] = pd.to_datetime(tmp['Start'])
tmp['End'] = pd.to_datetime(tmp['End'])
out = (tmp
.sort_values(by=['Start', 'End'], ignore_index=True)
.assign(m=lambda d: d['origin'].eq('df1'),
Start=lambda d: d['Start'].mask((~d['m']) & d['Start'].lt(s:=d['End'].where(out['m']).ffill()), s.add(pd.Timedelta('1day'))),
End=lambda d: d['End'].mask((~d['m']) & d['End'].gt(s:=d['Start'].where(out['m']).bfill()), s.sub(pd.Timedelta('1day'))),
)
.query('Start<End').drop(columns='m')
)
希望这对你有帮助!
英文:
Full vectorial solution:
tmp = (pd.concat([df1, df2], keys=['ref', 'other']).rename_axis(index=['origin', 'index'])
.sort_values(by=['Start', 'End']).astype('datetime64').reset_index()
.assign(ref_next_start=lambda d: d['Start'].where(d['origin']=='ref').bfill(),
ref_prev_end=lambda d: d['End'].where(d['origin']=='ref').ffill(),
other_prev_end=lambda d: d['End'].where(d['origin']=='other').ffill(),
)
)
m = tmp['origin'].eq('ref')
out = pd.concat(
[(tmp.loc[tmp['End'].ge(tmp['ref_prev_end'])]
.assign(End=lambda d: d['End'].mask(d['origin'].shift(-1).eq('ref'),
d['ref_next_start']-pd.Timedelta("1d")))
)[['origin', 'Start', 'End']],
(tmp.loc[m & tmp['End'].le(tmp['other_prev_end']), ['End', 'other_prev_end']]
.rename(columns={'End': 'Start', 'other_prev_end': 'End'})
.assign(Start=lambda d: d['Start']+pd.Timedelta("1d"), origin='other')
)]
).sort_values(by=['Start', 'End'], ignore_index=True)
Output:
origin Start End
0 ref 2023-01-01 2023-01-15
1 other 2023-01-20 2023-01-31
2 ref 2023-02-01 2023-02-20
3 other 2023-02-25 2023-03-01
4 other 2023-03-05 2023-03-14
5 ref 2023-03-15 2023-04-01
6 other 2023-04-02 2023-04-15
Previous temporary solution that also doesn't compute the last interval, it should be relatively efficient:
tmp = pd.concat([df1, df2], keys=['df1', 'df2'], names=['origin']).reset_index(level=0)
tmp['Start'] = pd.to_datetime(tmp['Start'])
tmp['End'] = pd.to_datetime(tmp['End'])
out = (tmp
.sort_values(by=['Start', 'End'], ignore_index=True)
.assign(m=lambda d: d['origin'].eq('df1'),
Start=lambda d: d['Start'].mask((~d['m']) & d['Start'].lt(s:=d['End'].where(out['m']).ffill()), s.add(pd.Timedelta('1day'))),
End=lambda d: d['End'].mask((~d['m']) & d['End'].gt(s:=d['Start'].where(out['m']).bfill()), s.sub(pd.Timedelta('1day'))),
)
.query('Start<End').drop(columns='m')
)
Output:
origin Start End
0 df1 2023-01-01 2023-01-15
3 df2 2023-01-20 2023-01-31
4 df1 2023-02-01 2023-02-20
5 df2 2023-02-25 2023-03-01
6 df2 2023-03-05 2023-03-14
7 df1 2023-03-15 2023-04-01
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。




评论