英文:
Dataframe of start and end dates into sum of days in an array of periods
问题
这是你要翻译的代码部分的内容:
import operator
import pandas
def days_in_periods(df: pandas.DataFrame,
inc_st: bool = True,
inc_en: bool = True,
period_freq='M') -> pandas.DataFrame:
""" Calculate the days in each period covered by any contract defined within the dataframe """
day_range = pandas.date_range(df['start_date'].min(),
df['end_date'].max(),
freq='D').to_series(name='days').reset_index(drop=True)
if inc_st:
st_op = operator.le
else:
st_op = operator.lt
if inc_en:
en_op = operator.ge
else:
en_op = operator.gt
df = df.merge(day_range, how='cross')
df = (df.loc[st_op(df['start_date'], df['days'])
& en_op(df['end_date'], df['days'])]
.resample(on='days', rule=period_freq)
.size()
)
df.index = df.index.to_period(period_freq)
return df
# create sample DataFrame
df_ex = pandas.DataFrame({'start_date': ['2022-01-01', '2022-02-01', '2022-03-01'],
'end_date': ['2022-02-15', '2022-04-01', '2022-04-15']})
# convert start_date and end_date to datetime objects
df_ex['start_date'] = pandas.to_datetime(df_ex['start_date'])
df_ex['end_date'] = pandas.to_datetime(df_ex['end_date'])
print(days_in_periods(df_ex, inc_st=False, inc_en=False))
我已将代码部分翻译为中文,去除了其他内容。如果有任何其他疑问或需要进一步的帮助,请随时告诉我。
英文:
I have a pandas data frame of start and end dates for contracts. I want to work out the number of in force contract days for all periods (e.g. months) covered by the contracts.
Example input:
start_date end_date
0 2022-01-01 2022-02-15
1 2022-02-01 2022-04-01
2 2022-03-01 2022-04-15
Resulting output:
2022-01 30
2022-02 41
2022-03 61
2022-04 14
Freq: M, dtype: int64
I have already written a working solution but it takes a fairly naive approach and I would appreciate suggestions for improved efficiency or more pandas/pythonic approaches.
Once I have figured out the minimum spanning set of periods the solution leaves behind array functions and uses a loop over rows and periods. I want to be able to apply this function to many millions of rows of a dataframe so efficiency will become important.
I looked for some array functions providing something like an overlap or a timedelta within a period but it seems start time and end time where the only useful tools available.
import pandas
def days_in_periods(df: pandas.DataFrame, inc_st: bool = True, inc_en: bool = True, period_freq='M') -> pandas.Series:
""" Calculate the days in each period covered by any contract defined within the dataframe """
# create period range
periods = pandas.period_range(start=df['start_date'].min(),
end=df['end_date'].max(),
freq=period_freq)
period_days = pandas.Series(data=[0] * len(periods),
index=periods,
dtype=int)
for index, row in df.iterrows():
st = row['start_date']
en = row['end_date']
print(f'contract: {st:%d/%m} - {en:%d/%m}')
total_days: int = (en - st).days + inc_en - (1 - inc_st)
print(f'contract days: {total_days}')
total_days_check: int = 0
for period in periods:
per_st = period.start_time
per_en = period.end_time
print(f'\tperiod: {per_st:%d/%m} - {per_en:%d/%m}', end='')
if per_en < st or per_st > en:
print('\t0')
continue
days: int = (per_en - per_st).days + 1
if per_st <= st <= per_en:
days -= (st - per_st).days + (1 - inc_st)
if per_st <= en <= per_en:
days -= (per_en - en).days + (1 - inc_en)
total_days_check += days
print(f'\t{days}')
period_days[period] += days
print(f'total days check: {total_days_check}')
assert total_days == total_days_check
return period_days
# create sample DataFrame
df_ex = pandas.DataFrame({'start_date': ['2022-01-01', '2022-02-01', '2022-03-01'],
'end_date': ['2022-02-15', '2022-04-01', '2022-04-15']})
# convert start_date and end_date to datetime objects
df_ex['start_date'] = pandas.to_datetime(df_ex['start_date'])
df_ex['end_date'] = pandas.to_datetime(df_ex['end_date'])
days_in_periods(df_ex, inc_st=True, inc_en=True)
days_in_periods(df_ex, inc_st=True, inc_en=False)
days_in_periods(df_ex, inc_st=False, inc_en=True)
print(days_in_periods(df_ex, inc_st=False, inc_en=False))
Rewrite after sammywemmy's suggestions below:
import operator
import pandas
def days_in_periods(df: pandas.DataFrame,
inc_st: bool = True,
inc_en: bool = True,
period_freq='M') -> pandas.DataFrame:
""" Calculate the days in each period covered by any contract defined within the dataframe """
day_range = pandas.date_range(df['start_date'].min(),
df['end_date'].max(),
freq='D').to_series(name='days').reset_index(drop=True)
if inc_st:
st_op = operator.le
else:
st_op = operator.lt
if inc_en:
en_op = operator.ge
else:
en_op = operator.gt
df = df.merge(day_range, how='cross')
df = (df.loc[st_op(df['start_date'], df['days'])
& en_op(df['end_date'], df['days'])]
.resample(on='days', rule=period_freq)
.size()
)
df.index = df.index.to_period(period_freq)
return df
# create sample DataFrame
df_ex = pandas.DataFrame({'start_date': ['2022-01-01', '2022-02-01', '2022-03-01'],
'end_date': ['2022-02-15', '2022-04-01', '2022-04-15']})
# convert start_date and end_date to datetime objects
df_ex['start_date'] = pandas.to_datetime(df_ex['start_date'])
df_ex['end_date'] = pandas.to_datetime(df_ex['end_date'])
print(days_in_periods(df_ex, inc_st=False, inc_en=False))
答案1
得分: 2
看起来像是某种不等式连接 - 如果是这种情况,您可以使用 conditional_join 来获取结果,然后进行分组 - 这应该比使用 iterrows 更快:
# pip install pyjanitor
import pandas as pd
import janitor
# 创建一个日期的 Pandas series:
minimum = df_ex.to_numpy().min(axis=None)
maximum = df_ex.to_numpy().max(axis=None)
ser = pd.date_range(minimum, maximum, freq='D', name='dates').to_series()
ser.index = range(len(ser))
(df_ex
.conditional_join(
ser,
# 左侧列,右侧列,比较符
('start_date', 'dates', '<'),
('end_date', 'dates', '>'),
# 根据数据大小,可能使用 numba 可以提高性能
use_numba = False,
)
.loc(axis=1)[['dates']]
.resample(on='dates', rule='MS')
.size()
)
dates
2022-01-01 30
2022-02-01 41
2022-03-01 61
2022-04-01 14
Freq: MS, dtype: int64
请注意,这段代码是用于数据处理的,主要使用了 pandas 和 pyjanitor 库。
英文:
Looks like some form of inequality join - if that is the case, you can use conditional_join from pyjanitor to get your results, before grouping - should be faster than having to use iterrows:
# pip install pyjanitor
import pandas as pd
import janitor
# build a Pandas series of dates:
minimum = df_ex.to_numpy().min(axis=None)
maximum = df_ex.to_numpy().max(axis=None)
ser = pd.date_range(minimum, maximum, freq='D', name='dates').to_series()
ser.index = range(len(ser))
(df_ex
.conditional_join(
ser,
# column from left, column from right, comparator
('start_date', 'dates', '<'),
('end_date', 'dates', '>'),
# depending on the data size,
# you might get more performance with numba
use_numba = False,
)
.loc(axis=1)[['dates']]
.resample(on='dates', rule='MS')
.size()
)
dates
2022-01-01 30
2022-02-01 41
2022-03-01 61
2022-04-01 14
Freq: MS, dtype: int64
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论