Bootstrap
本文主要介绍Python pandas中,给出开始日期和结束日期的范围,在这个范围内,随机生成日期的方法。

1、使用np.random.randint和to_datetime生成

def random_dates(start, end, n=10):
    """Draw ``n`` random second-resolution datetimes between two timestamps.

    Args:
        start: Lower bound; must expose ``.value`` (nanoseconds since the
            Unix epoch), e.g. the result of ``pd.to_datetime('2015-01-01')``.
        end: Upper bound, same kind of object as ``start``. It is passed to
            ``np.random.randint`` as the exclusive ``high`` bound.
        n: How many datetimes to generate (default 10).

    Returns:
        The result of ``pd.to_datetime(..., unit='s')`` applied to the
        sampled epoch seconds. Values are unsorted and may repeat.
    """
    # Floor both bounds from nanoseconds to whole epoch seconds.
    nanos_per_second = 10**9
    low, high = (bound.value // nanos_per_second for bound in (start, end))
    epoch_seconds = np.random.randint(low, high, n)
    return pd.to_datetime(epoch_seconds, unit='s')
>>> start = pd.to_datetime('2015-01-01')
>>> end = pd.to_datetime('2018-01-01')
>>> random_dates(start, end)
DatetimeIndex(['2016-10-08 07:34:13', '2015-11-15 06:12:48',
'2015-01-24 10:11:04', '2015-03-26 16:23:53',
'2017-04-01 00:38:21', '2015-05-15 03:47:54',
'2015-06-24 07:32:32', '2015-11-10 20:39:36',
'2016-07-25 05:48:09', '2015-03-19 16:05:19'],
dtype='datetime64[ns]', freq=None)

或者

def random_datetimes_or_dates(start, end, out_format='datetime', n=10):
    """Draw ``n`` random datetimes (or whole dates) between two timestamps.

    ``start.value`` and ``end.value`` are Unix timestamps in nanoseconds.
    They are floor-divided down to the sampling resolution, random integers
    are drawn at that resolution, and the matching ``unit`` is handed to
    ``pd.to_datetime``.

    Args:
        start: Lower bound; must expose ``.value`` (nanoseconds).
        end: Upper bound, same kind of object; used as the exclusive
            ``high`` argument of ``np.random.randint``.
        out_format: ``'datetime'`` samples at one-second resolution; any
            other value samples at one-day resolution.
        n: How many values to generate (default 10).

    Returns:
        The result of ``pd.to_datetime`` on the sampled integers. Values
        are unsorted and may repeat.
    """
    if out_format == 'datetime':
        # Nanoseconds per second.
        nanos_per_step, unit = 10**9, 's'
    else:
        # Nanoseconds per day.
        nanos_per_step, unit = 24 * 60 * 60 * 10**9, 'D'

    low = start.value // nanos_per_step
    high = end.value // nanos_per_step
    samples = np.random.randint(low, high, n)
    return pd.to_datetime(samples, unit=unit)
>>> start = pd.to_datetime('2015-01-01')
>>> end = pd.to_datetime('2018-01-01')
>>> random_datetimes_or_dates(start, end, out_format='datetime')
DatetimeIndex(['2017-01-30 05:14:27', '2016-10-18 21:17:16',
'2016-10-20 08:38:02', '2015-09-02 00:03:08',
'2015-06-04 02:38:12', '2016-02-19 05:22:01',
'2015-11-06 10:37:10', '2017-12-17 03:26:02',
'2017-11-20 06:51:32', '2016-01-02 02:48:03'],
dtype='datetime64[ns]', freq=None)
>>> random_datetimes_or_dates(start, end, out_format='not datetime')
DatetimeIndex(['2017-05-10', '2017-12-31', '2017-11-10', '2015-05-02',
'2016-04-11', '2015-11-27', '2015-03-29', '2017-05-21',
'2015-05-11', '2017-02-08'],
dtype='datetime64[ns]', freq=None)

2、使用np.random.rand和to_timedelta生成

def random_dates(start, end, n, unit='D', seed=None):
    """Draw ``n`` random datetimes by adding random offsets to ``start``.

    Args:
        start: Lower bound of the range (e.g. a ``pd.Timestamp``).
        end: Upper bound of the range. ``(end - start).days + 1`` is used as
            the span, so offsets fall in ``[0, days + 1)`` units.
        n: How many datetimes to generate.
        unit: Unit handed to ``pd.to_timedelta`` for the random offsets
            (default ``'D'``). The span is always counted in days, so
            other units shrink or stretch the covered range accordingly.
        seed: Optional seed for NumPy's global RNG. When ``None`` (the
            default) the RNG state is left untouched, so successive calls
            give different results.

    Returns:
        ``start`` plus the random timedeltas. Values are unsorted and may
        repeat.
    """
    # Seed only on request. The previous `if not seed: np.random.seed(0)`
    # ignored the caller's seed and forced every unseeded call to return
    # the same dates.
    if seed is not None:
        np.random.seed(seed)

    ndays = (end - start).days + 1
    return pd.to_timedelta(np.random.rand(n) * ndays, unit=unit) + start
>>> np.random.seed(0)
>>> start = pd.to_datetime('2015-01-01')
>>> end = pd.to_datetime('2018-01-01')
>>> random_dates(start, end, 10)
DatetimeIndex([ '2016-08-25 01:09:42.969600',
'2017-02-23 13:30:20.304000',
'2016-10-23 05:33:15.033600',
'2016-08-20 17:41:04.012799999',
'2016-04-09 17:59:00.815999999',
'2016-12-09 13:06:00.748800',
'2016-04-25 00:47:45.974400',
'2017-09-05 06:35:58.444800',
'2017-11-23 03:18:47.347200',
'2016-02-25 15:14:53.894400'],
dtype='datetime64[ns]', freq=None)

想要不重复的随机日期,你可以用np.random.choice与replace=False:

def random_dates2_unique(start, end, n, unit='D', seed=None):
    """Draw ``n`` distinct random datetimes by sampling without replacement.

    Args:
        start: Lower bound of the range (e.g. a ``pd.Timestamp``).
        end: Upper bound of the range. Offsets are sampled from the
            integers ``0 .. (end - start).days`` inclusive.
        n: How many datetimes to generate. ``np.random.choice`` with
            ``replace=False`` rejects an ``n`` larger than the number of
            available offsets.
        unit: Unit handed to ``pd.to_timedelta`` for the integer offsets
            (default ``'D'``).
        seed: Optional seed for NumPy's global RNG. When ``None`` (the
            default) the RNG state is left untouched.

    Returns:
        ``start`` plus the sampled timedeltas: unsorted, with no repeats.
    """
    # Seed only on request. The previous `if not seed: np.random.seed(0)`
    # ignored the caller's seed and forced every unseeded call to return
    # the same dates.
    if seed is not None:
        np.random.seed(seed)

    ndays = (end - start).days + 1
    offsets = np.random.choice(ndays, n, replace=False)
    return start + pd.to_timedelta(offsets, unit=unit)

3、使用numpy.random.choice生成

def random_dates(start, end, n, freq, seed=None):
    """Pick ``n`` distinct datetimes from a regular grid, sorted ascending.

    Args:
        start: First point of the grid, passed to ``pd.date_range``.
        end: Last point of the grid, passed to ``pd.date_range``.
        n: How many datetimes to pick (sampled without replacement).
        freq: Grid spacing, passed to ``pd.date_range`` as ``freq``.
        seed: Optional seed for NumPy's global RNG; ``None`` leaves the
            RNG state untouched.

    Returns:
        The sorted sample converted with ``pd.to_datetime``.
    """
    if seed is not None:
        np.random.seed(seed)

    candidates = pd.date_range(start, end, freq=freq)
    picked = np.random.choice(candidates, n, replace=False)
    in_order = np.sort(picked)
    return pd.to_datetime(in_order)
>>> random_dates('2015-01-01', '2018-01-01', 10, 'H', seed=[3, 1415])
DatetimeIndex(['2015-04-24 02:00:00', '2015-11-26 23:00:00',
'2016-01-18 00:00:00', '2016-06-27 22:00:00',
'2016-08-12 17:00:00', '2016-10-21 11:00:00',
'2016-11-07 11:00:00', '2016-12-09 23:00:00',
'2017-02-20 01:00:00', '2017-06-17 18:00:00'],
dtype='datetime64[ns]', freq=None)

4、使用numpy.random.permutation生成

def random_dates_2(start, end, n, freq, seed=None):
    """Pick ``n`` distinct datetimes from a regular grid via a permutation.

    Args:
        start: First point of the grid, passed to ``pd.date_range``.
        end: Last point of the grid, passed to ``pd.date_range``.
        n: How many datetimes to pick; the first ``n`` entries of a random
            permutation of grid positions are used.
        freq: Grid spacing, passed to ``pd.date_range`` as ``freq``.
        seed: Optional seed for NumPy's global RNG; ``None`` leaves the
            RNG state untouched.

    Returns:
        The grid entries at the chosen positions, in ascending position
        order.
    """
    if seed is not None:
        np.random.seed(seed)

    grid = pd.date_range(start, end, freq=freq)
    positions = np.arange(len(grid))
    shuffled = np.random.permutation(positions)
    # Sorting the kept positions returns the dates in grid order.
    chosen = np.sort(shuffled[:n])
    return grid[chosen]
>>> random_dates_2('2015-01-01', '2018-01-01', 10, 'H', seed=[3, 1415])
DatetimeIndex(['2015-04-24 02:00:00', '2015-11-26 23:00:00',
'2016-01-18 00:00:00', '2016-06-27 22:00:00',
'2016-08-12 17:00:00', '2016-10-21 11:00:00',
'2016-11-07 11:00:00', '2016-12-09 23:00:00',
'2017-02-20 01:00:00', '2017-06-17 18:00:00'],
dtype='datetime64[ns]', freq=None)