英文:
Fill gaps in time intervals with other time intervals
问题
我们有两个带有时间间隔的表格。我想要填补df1
中的间隙,使用df2
,就像图中所示得到df3
。df1
保持不变,只有df2
中与df1
间隙(差异)重叠的部分被移到df3
中。
import pandas as pd
df1 = pd.DataFrame({'Start': ['2023-01-01', '2023-02-01', '2023-03-15', '2023-04-18', '2023-05-15', '2023-05-25'],
'End': ['2023-01-15', '2023-02-20', '2023-04-01', '2023-05-03', '2023-05-20', '2023-05-30']})
df2 = pd.DataFrame({'Start': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05', '2023-04-18', '2023-05-12'],
'End': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15', '2023-05-10', '2023-06-05']})
df3 = pd.DataFrame({'Start': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02', '2023-04-18', '2023-05-04', '2023-05-12', '2023-05-15', '2023-05-21', '2023-05-25', '2023-05-31'],
'End': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15', '2023-05-03', '2023-05-10', '2023-05-14', '2023-05-20', '2023-05-24', '2023-05-30', '2023-06-05']})
# 你的代码部分
import plotly.express as px
df_plot = pd.concat(
[
df1.assign(color='df1', df='df1'),
df2.assign(color='df2', df='df2'),
df3.assign(color=['df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2'], df='df3')
],
)
fig = px.timeline(df_plot, x_start="Start", x_end="End", y="df", color="color")
fig.update_yaxes(categoryorder='category descending')
fig.show()
英文:
We have two tables with time intervals. I want to fill gaps in df1
with df2
as in the graph to get df3
. df1
is moved to df3
as it is, and only the parts of df2
that lie in the gaps of df1
(difference) are moved to df3
.
df1 = pd.DataFrame({'Start': ['2023-01-01', '2023-02-01', '2023-03-15', '2023-04-18', '2023-05-15', '2023-05-25'],
'End': ['2023-01-15', '2023-02-20', '2023-04-01', '2023-05-03', '2023-05-20', '2023-05-30']})
df2 = pd.DataFrame({'Start': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05', '2023-04-18', '2023-05-12'],
'End': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15', '2023-05-10', '2023-06-05']})
df3 = pd.DataFrame({'Start': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02', '2023-04-18', '2023-05-04', '2023-05-12', '2023-05-15', '2023-05-21', '2023-05-25', '2023-05-31'],
'End': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15', '2023-05-03', '2023-05-10', '2023-05-14', '2023-05-20', '2023-05-24', '2023-05-30', '2023-06-05']})
# df1
Start End
0 2023-01-01 2023-01-15
1 2023-02-01 2023-02-20
2 2023-03-15 2023-04-01
3 2023-04-18 2023-05-03
4 2023-05-15 2023-05-20
5 2023-05-25 2023-05-30
# df2
Start End
0 2023-01-02 2023-01-03
1 2023-01-05 2023-01-10
2 2023-01-20 2023-02-10
3 2023-02-25 2023-03-01
4 2023-03-05 2023-04-15
5 2023-04-18 2023-05-10
6 2023-05-12 2023-06-05
# df3 (desired result)
Start End
0 2023-01-01 2023-01-15
1 2023-01-20 2023-01-31
2 2023-02-01 2023-02-20
3 2023-02-25 2023-03-01
4 2023-03-05 2023-03-14
5 2023-03-15 2023-04-01
6 2023-04-02 2023-04-15
7 2023-04-18 2023-05-03
8 2023-05-04 2023-05-10
9 2023-05-12 2023-05-14
10 2023-05-15 2023-05-20
11 2023-05-21 2023-05-24
12 2023-05-25 2023-05-30
13 2023-05-31 2023-06-05
Code to generate plot:
import plotly.express as px
df_plot = pd.concat(
[
df1.assign(color='df1', df='df1'),
df2.assign(color='df2', df='df2'),
df3.assign(color=['df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2', 'df2', 'df1', 'df2', 'df1', 'df2'], df='df3')
],
)
fig = px.timeline(df_plot, x_start="Start", x_end="End", y="df", color="color")
fig.update_yaxes(categoryorder='category descending')
fig.show()
答案1
得分: 3
我认为我可以让你接近:
df1 = pd.DataFrame({'开始': ['2023-01-01', '2023-02-01', '2023-03-15'],
'结束': ['2023-01-15', '2023-02-20', '2023-04-01']})
df2 = pd.DataFrame({'开始': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05'],
'结束': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15']})
df3 = pd.DataFrame({'开始': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02'],
'结束': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15']})
df1['日期'] = [pd.date_range(s,e) for s, e in zip(df1['开始'], df1['结束'])]
df2['日期'] = [pd.date_range(s,e) for s, e in zip(df2['开始'], df2['结束'])]
df1e = df1.explode('日期').assign(来源='df1')
df2e = df2.explode('日期').assign(来源='df2')
df3e = df1e.set_index(df1e['日期']).combine_first(df2e.set_index(df2e['日期']))
df3e['日期'] = pd.to_datetime(df3e['日期'])
df3e['分组'] = ((df3e['来源'] != df3e['来源'].shift()) |
(df3e['日期'] - df3e['日期'].shift() > pd.Timedelta(days=1))).cumsum()
df_out = df3e.groupby(['分组', '来源'])['日期'].agg([min, max])
英文:
I think I can get you close:
df1 = pd.DataFrame({'Start': ['2023-01-01', '2023-02-01', '2023-03-15'],
'End': ['2023-01-15', '2023-02-20', '2023-04-01']})
df2 = pd.DataFrame({'Start': ['2023-01-02', '2023-01-05', '2023-01-20', '2023-02-25', '2023-03-05'],
'End': ['2023-01-03', '2023-01-10', '2023-02-10', '2023-03-01', '2023-04-15']})
df3 = pd.DataFrame({'Start': ['2023-01-01', '2023-01-20', '2023-02-01', '2023-02-25', '2023-03-05', '2023-03-15', '2023-04-02'],
'End': ['2023-01-15', '2023-01-31', '2023-02-20', '2023-03-01', '2023-03-14', '2023-04-01', '2023-04-15']})
df1['dates'] = [pd.date_range(s,e) for s, e in zip(df1['Start'], df1['End'])]
df2['dates'] = [pd.date_range(s,e) for s, e in zip(df2['Start'], df2['End'])]
df1e = df1.explode('dates').assign(source='df1')
df2e = df2.explode('dates').assign(source='df2')
df3e = df1e.set_index(df1e['dates']).combine_first(df2e.set_index(df2e['dates']))
df3e['dates'] = pd.to_datetime(df3e['dates'])
df3e['group'] = ((df3e['source'] != df3e['source'].shift()) |
(df3e['dates'] - df3e['dates'].shift() > pd.Timedelta(days=1))).cumsum()
df_out = df3e.groupby(['group', 'source'])['dates'].agg([min, max])
Output:
min max
group source
1 df1 2023-01-01 2023-01-15
2 df2 2023-01-20 2023-01-31
3 df1 2023-02-01 2023-02-20
4 df2 2023-02-25 2023-03-01
5 df2 2023-03-05 2023-03-14
6 df1 2023-03-15 2023-04-01
7 df2 2023-04-02 2023-04-15
Graphical Output:
import plotly.express as px
df_out = df_out.reset_index().rename({'source':'color', 'min':'Start', 'max':'End'}, axis=1)
df_plot = pd.concat(
[
df1.assign(color='df1'),
df2.assign(color='df2'),
df_out
],
keys=['df1' , 'df2', 'df3']
).reset_index(level=0, names='df')
fig = px.timeline(df_plot, x_start="Start", x_end="End", y="df", color="color")
fig.update_yaxes(categoryorder='category descending')
fig.show()
Graph:
with updated dataset:
min max
group source
1 df1 2023-01-01 2023-01-15
2 df2 2023-01-20 2023-01-31
3 df1 2023-02-01 2023-02-20
4 df2 2023-02-25 2023-03-01
5 df2 2023-03-05 2023-03-14
6 df1 2023-03-15 2023-04-01
7 df2 2023-04-02 2023-04-15
8 df1 2023-04-18 2023-05-03
9 df2 2023-05-04 2023-05-10
10 df2 2023-05-12 2023-05-14
11 df1 2023-05-15 2023-05-20
12 df2 2023-05-21 2023-05-24
13 df1 2023-05-25 2023-05-30
14 df2 2023-05-31 2023-06-05
Graph output:
答案2
得分: 2
以下是代码部分的翻译:
tmp = (pd.concat([df1, df2], keys=['ref', 'other']).rename_axis(index=['origin', 'index'])
.sort_values(by=['Start', 'End']).astype('datetime64').reset_index()
.assign(ref_next_start=lambda d: d['Start'].where(d['origin']=='ref').bfill(),
ref_prev_end=lambda d: d['End'].where(d['origin']=='ref').ffill(),
other_prev_end=lambda d: d['End'].where(d['origin']=='other').ffill(),
)
)
m = tmp['origin'].eq('ref')
out = pd.concat(
[(tmp.loc[tmp['End'].ge(tmp['ref_prev_end'])]
.assign(End=lambda d: d['End'].mask(d['origin'].shift(-1).eq('ref'),
d['ref_next_start']-pd.Timedelta("1d")))
)[['origin', 'Start', 'End']],
(tmp.loc[m & tmp['End'].le(tmp['other_prev_end']), ['End', 'other_prev_end']]
.rename(columns={'End': 'Start', 'other_prev_end': 'End'})
.assign(Start=lambda d: d['Start']+pd.Timedelta("1d"), origin='other')
)]
).sort_values(by=['Start', 'End'], ignore_index=True)
tmp = pd.concat([df1, df2], keys=['df1', 'df2'], names=['origin']).reset_index(level=0)
tmp['Start'] = pd.to_datetime(tmp['Start'])
tmp['End'] = pd.to_datetime(tmp['End'])
out = (tmp
.sort_values(by=['Start', 'End'], ignore_index=True)
.assign(m=lambda d: d['origin'].eq('df1'),
Start=lambda d: d['Start'].mask((~d['m']) & d['Start'].lt(s:=d['End'].where(out['m']).ffill()), s.add(pd.Timedelta('1day'))),
End=lambda d: d['End'].mask((~d['m']) & d['End'].gt(s:=d['Start'].where(out['m']).bfill()), s.sub(pd.Timedelta('1day'))),
)
.query('Start<End').drop(columns='m')
)
希望这对你有帮助!
英文:
Full vectorial solution:
tmp = (pd.concat([df1, df2], keys=['ref', 'other']).rename_axis(index=['origin', 'index'])
.sort_values(by=['Start', 'End']).astype('datetime64').reset_index()
.assign(ref_next_start=lambda d: d['Start'].where(d['origin']=='ref').bfill(),
ref_prev_end=lambda d: d['End'].where(d['origin']=='ref').ffill(),
other_prev_end=lambda d: d['End'].where(d['origin']=='other').ffill(),
)
)
m = tmp['origin'].eq('ref')
out = pd.concat(
[(tmp.loc[tmp['End'].ge(tmp['ref_prev_end'])]
.assign(End=lambda d: d['End'].mask(d['origin'].shift(-1).eq('ref'),
d['ref_next_start']-pd.Timedelta("1d")))
)[['origin', 'Start', 'End']],
(tmp.loc[m & tmp['End'].le(tmp['other_prev_end']), ['End', 'other_prev_end']]
.rename(columns={'End': 'Start', 'other_prev_end': 'End'})
.assign(Start=lambda d: d['Start']+pd.Timedelta("1d"), origin='other')
)]
).sort_values(by=['Start', 'End'], ignore_index=True)
Output:
origin Start End
0 ref 2023-01-01 2023-01-15
1 other 2023-01-20 2023-01-31
2 ref 2023-02-01 2023-02-20
3 other 2023-02-25 2023-03-01
4 other 2023-03-05 2023-03-14
5 ref 2023-03-15 2023-04-01
6 other 2023-04-02 2023-04-15
Previous temporary solution that also doesn't compute the last interval, it should be relatively efficient:
tmp = pd.concat([df1, df2], keys=['df1', 'df2'], names=['origin']).reset_index(level=0)
tmp['Start'] = pd.to_datetime(tmp['Start'])
tmp['End'] = pd.to_datetime(tmp['End'])
out = (tmp
.sort_values(by=['Start', 'End'], ignore_index=True)
.assign(m=lambda d: d['origin'].eq('df1'),
Start=lambda d: d['Start'].mask((~d['m']) & d['Start'].lt(s:=d['End'].where(out['m']).ffill()), s.add(pd.Timedelta('1day'))),
End=lambda d: d['End'].mask((~d['m']) & d['End'].gt(s:=d['Start'].where(out['m']).bfill()), s.sub(pd.Timedelta('1day'))),
)
.query('Start<End').drop(columns='m')
)
Output:
origin Start End
0 df1 2023-01-01 2023-01-15
3 df2 2023-01-20 2023-01-31
4 df1 2023-02-01 2023-02-20
5 df2 2023-02-25 2023-03-01
6 df2 2023-03-05 2023-03-14
7 df1 2023-03-15 2023-04-01
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论