# Build a shuffled long-format dataframe: `n_samples` samples with
# `n_rows // n_samples` rows each, 6 continuous columns and 2 categorical targets.
n_samples = 1_000
n_rows = 10_000

sample_ids = np.arange(n_samples).repeat(n_rows//n_samples).reshape(-1,1)
feat_ids = np.tile(np.arange(n_rows // n_samples), n_samples).reshape(-1,1)
cont = np.random.randn(n_rows, 6)
ind_cat = np.random.randint(0, 3, (n_rows, 1))
target = np.array(['a', 'b', 'c'])[ind_cat]
ind_cat2 = np.random.randint(0, 3, (n_rows, 1))
target2 = np.array(['a', 'b', 'c'])[ind_cat2]
# The second target column must use the independently drawn `target2`
# (previously `target` was concatenated twice, making both columns identical).
data = np.concatenate([sample_ids, feat_ids, cont, target, target2], -1)
columns = ['sample_id', 'feat_id'] + (np.arange(6) + 1).astype(str).tolist() + ['target'] + ['target2']
df = pd.DataFrame(data, columns=columns)
idx = np.random.choice(np.arange(len(df)), len(df), False)
# Concatenating numeric and string arrays yields a string array, so the
# numeric columns are cast back to their intended dtypes here.
new_dtypes = {'sample_id':np.int32, 'feat_id':np.int32, '1':np.float32, '2':np.float32, '3':np.float32, '4':np.float32, '5':np.float32, '6':np.float32}
df = df.astype(dtype=new_dtypes)
# Shuffle the rows so later checks cannot rely on the input order.
df = df.loc[idx].reset_index(drop=True)
df

# y_func reduces the targets with the mode taken along axis 1.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', data_cols=['1', '2', '3', '4', '5', '6'], target_col=['target'], y_func=y_func)
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000,))
# randint's upper bound is exclusive: add 1 so the largest sample id can also be spot-checked.
rand_idx = np.random.randint(0, np.max(df.sample_id) + 1)
# Reference ordering to compare against: rows sorted by sample, then by feature.
sorted_df = df.sort_values(by=['sample_id', 'feat_id']).reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(scipy.stats.mode(sorted_df[sorted_df.sample_id == rand_idx][['target']].values).mode), y[rand_idx])

# y_func reduces the targets with the mode taken along axis 1.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# No data_cols given here; the asserted X shape still has 6 columns in its last axis.
X, y = df2xy(df, sample_col='sample_id', feat_col='feat_id', target_col=['target', 'target2'], y_func=y_func)
test_eq(X.shape, (1000, 10, 6))
test_eq(y.shape, (1000, 2))
# randint's upper bound is exclusive: add 1 so the largest sample id can also be spot-checked.
rand_idx = np.random.randint(0, np.max(df.sample_id) + 1)
# Reference ordering to compare against: rows sorted by sample, then by feature.
sorted_df = df.sort_values(by=['sample_id', 'feat_id']).reset_index(drop=True)
test_eq(X[rand_idx], sorted_df[sorted_df.sample_id == rand_idx][['1', '2', '3', '4', '5', '6']].values)
test_eq(np.squeeze(scipy.stats.mode(sorted_df[sorted_df.sample_id == rand_idx][['target', 'target2']].values).mode), y[rand_idx])

from io import StringIO
# Example: one row per sample, two value columns and one target column.
TESTDATA = StringIO("""sample_id;value_0;value_1;target
    rob;2;3;hot
    alice;6;7;lukewarm
    eve;11;12;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
# No feat_col and no y_func: X is expected to be (3, 1, 2) and y to hold one label per sample.
X, y = df2Xy(df, sample_col='sample_id', target_col='target', data_cols=['value_0', 'value_1'], sort_by='sample_id')
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3,))
X, y

(array([[[ 6,  7]],
 
        [[11, 12]],
 
        [[ 2,  3]]]),
 array(['lukewarm', 'cold', 'hot'], dtype=object))

# Example: each sample appears once per timestep (two timesteps), with a single value column.
TESTDATA = StringIO("""sample_id;timestep;values;target
    rob;1;2;hot
    alice;1;6;lukewarm
    eve;1;11;cold
    
    rob;2;3;hot
    alice;2;7;lukewarm
    eve;2;12;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
# The target repeats on every row of a sample; y_func takes the mode along axis 1.
def y_func(o): return scipy.stats.mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', target_col='target', data_cols=['values'], sort_by='timestep', to3d=True, y_func=y_func)
test_eq(X.shape, (3, 1, 2))
test_eq(y.shape, (3, ))
print(X, y)

[[[ 6 11]]

 [[ 2  7]]

 [[12  3]]] ['lukewarm' 'cold' 'hot']

# Example: each sample has one row per trait (4 traits) and two value columns.
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
    rob;green;2;3;hot
    rob;yellow;3;4;hot
    rob;blue;4;5;hot
    rob;red;5;6;hot
    alice;green;6;7;lukewarm
    alice;yellow;7;8;lukewarm
    alice;blue;8;9;lukewarm
    alice;red;9;10;lukewarm
    eve;yellow;11;12;cold
    eve;green;10;11;cold
    eve;blue;12;12;cold
    eve;red;13;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
# Shuffle the rows (permutation without replacement) before converting.
idx = np.random.choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return scipy.stats.mode(o, axis=1).mode
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col='target', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3,))

[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] ['lukewarm' 'cold' 'hot']

# Example: same trait layout as above but with two target columns (target1, target2).
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target1;target2
    rob;green;2;3;hot;good
    rob;yellow;3;4;hot;good
    rob;blue;4;5;hot;good
    rob;red;5;6;hot;good
    alice;green;6;7;lukewarm;good
    alice;yellow;7;8;lukewarm;good
    alice;blue;8;9;lukewarm;good
    alice;red;9;10;lukewarm;good
    eve;yellow;11;12;cold;bad
    eve;green;10;11;cold;bad
    eve;blue;12;12;cold;bad
    eve;red;13;14;cold;bad
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# With two target columns y is expected to have one column per target: shape (3, 2).
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', target_col=['target1', 'target2'], data_cols=['value_0', 'value_1'], y_func=y_func)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, 2))
print(X, y)

[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] [['lukewarm' 'good']
 ['cold' 'bad']
 ['hot' 'good']]

# Example: trait layout converted without passing target_col.
TESTDATA = StringIO("""sample_id;trait;value_0;value_1;target
    rob;green;2;3;hot
    rob;yellow;3;4;hot
    rob;blue;4;5;hot
    rob;red;5;6;hot
    alice;green;6;7;lukewarm
    alice;yellow;7;8;lukewarm
    alice;blue;8;9;lukewarm
    alice;red;9;10;lukewarm
    eve;yellow;11;12;cold
    eve;green;10;11;cold
    eve;blue;12;12;cold
    eve;red;13;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
# Shuffle the rows (permutation without replacement) before converting.
idx = np.random.choice(len(df), len(df), False)
df = df.iloc[idx]
display(df)
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# target_col is omitted, so y is expected to be None even though y_func is supplied.
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', data_cols=['value_0', 'value_1'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y, None)

[[[ 8  9]
  [ 6  7]
  [ 9 10]
  [ 7  8]]

 [[12 12]
  [10 11]
  [13 14]
  [11 12]]

 [[ 4  5]
  [ 2  3]
  [ 5  6]
  [ 3  4]]] None

# Example: every (sample, trait) pair appears at two timesteps with a single value column.
TESTDATA = StringIO("""sample_id;trait;timestep;values;target
    rob;green;1;2;hot
    rob;yellow;1;3;hot
    rob;blue;1;4;hot
    rob;red;1;5;hot
    alice;green;1;6;lukewarm
    alice;yellow;1;7;lukewarm
    alice;blue;1;8;lukewarm
    alice;red;1;9;lukewarm
    eve;yellow;1;11;cold
    eve;green;1;10;cold
    eve;blue;1;12;cold
    eve;red;1;13;cold
    
    rob;green;2;3;hot
    rob;yellow;2;4;hot
    rob;blue;2;5;hot
    rob;red;2;6;hot
    alice;green;2;7;lukewarm
    alice;yellow;2;8;lukewarm
    alice;blue;2;9;lukewarm
    alice;red;2;10;lukewarm
    eve;yellow;2;12;cold
    eve;green;2;11;cold
    eve;blue;2;13;cold
    eve;red;2;14;cold
    """)

df = pd.read_csv(TESTDATA, sep=";")
display(df)
def y_func(o): return scipy.stats.mode(o, axis=1).mode
# 3 samples x 4 traits x 2 timesteps -> asserted X shape (3, 4, 2).
X, y = df2xy(df, sample_col='sample_id', feat_col='trait', sort_by='timestep', target_col='target', data_cols=['values'], y_func=y_func)
print(X, y)
test_eq(X.shape, (3, 4, 2))
test_eq(y.shape, (3, ))

[[[ 8  6]
  [ 9  7]
  [12 10]
  [13 11]]

 [[ 4  2]
  [ 5  3]
  [ 9  7]
  [10  8]]

 [[13 11]
  [14 12]
  [ 5  3]
  [ 6  4]]] ['lukewarm' 'cold' 'hot']

# Two users (ids 1 and 2) with 4 rows each and 3 random feature columns.
user = np.array([1,2]).repeat(4).reshape(-1,1)
val = np.random.rand(8, 3)
data = np.concatenate([user, val], axis=-1)
df = pd.DataFrame(data, columns=['user', 'x1', 'x2', 'x3'])
# Expected output shape is (2, 3, 4): 2 users, 3 feature columns, 4 rows per user.
test_eq(df2np3d(df, ['user'], ['x1', 'x2', 'x3']).shape, (2, 3, 4))

# Inject NaNs wherever the random values exceed 0.8.
data = np.random.randn(10, 2)
mask = data > .8
data[mask] = np.nan
df = pd.DataFrame(data, columns=['A', 'B'])
df = add_missing_value_cols(df, cols=None, dtype=float)
# Each `missing_<col>` column must sum to the number of NaNs in its source column.
test_eq(df['A'].isnull().sum(), df['missing_A'].sum())
test_eq(df['B'].isnull().sum(), df['missing_B'].sum())
df

# Build a small dataframe with a gap in its dates (2021-05-04 and 2021-05-05 are dropped).
today = datetime.now()  # not used anywhere else in this cell
dates = pd.date_range('2021-05-01', '2021-05-07')
# Keep the first 3 and the last 2 days of the week.
dates2 = np.concatenate([dates[:3], dates[-2:]])
# Sanity check: the full range between min and max of dates2 equals the original week.
dates3 = pd.date_range(dates2.min(), dates2.max())
test_eq(dates, dates3)
data = np.zeros((len(dates2), 3))
# NOTE(review): the datetimes are written into a float array here and converted back
# with pd.to_datetime below - presumably as nanoseconds since epoch; confirm.
data[:, 0] = dates2
data[:, 1] = np.random.rand(len(dates2))
data[:, 2] = np.array([0, 1, 0, 0, 1])
cols = ['date', 'a', 'b']
date_df = pd.DataFrame(data, columns=cols)
date_df['date'] = pd.to_datetime(date_df['date'])
date_df

# Three calls on the same `date_df`; the results are only displayed, not assigned.
# 1) No grouping.
add_missing_timestamps(date_df, 'date')

# 2) Grouped by column 'b'.
add_missing_timestamps(date_df, 'date', groupby='b')

# 3) Grouped by column 'b' with range_by_group=False.
# NOTE(review): presumably this uses one date range shared by all groups - confirm against the function.
add_missing_timestamps(date_df, 'date', groupby='b', range_by_group=False)

# Plot both outputs of time_encoding for each frequency, passing a dataframe column (Series).
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
    # Date range from 2021-03-01 up to the moment the cell runs.
    tdf = pd.DataFrame(pd.date_range('2021-03-01', datetime.today()), columns=['date'])
    a,b = time_encoding(tdf.date, freq=freq)
    plt.plot(a)
    plt.plot(b)
    plt.title(freq)
    plt.show()

# Same plots as the previous loop, but passing the object returned by pd.date_range directly.
for freq in ['microsecond', 'second', 'minute', 'hour', 'day', 'dayofweek', 'dayofyear', 'month']:
    dateindex = pd.date_range('2021-03-01', datetime.today())
    a,b = time_encoding(dateindex, freq=freq)
    plt.plot(a)
    plt.plot(b)
    plt.title(freq)
    plt.show()

# Encode the day of week of `date_df['date']`, plot both components and store them as new columns.
dow_sin, dow_cos = time_encoding(date_df['date'], 'dayofweek')
plt.plot(dow_sin)
plt.plot(dow_cos)
plt.title('DayOfWeek')
plt.show()
date_df['dow_sin'] = dow_sin
date_df['dow_cos'] = dow_cos
date_df

# Fill in the missing timestamps per group 'b' (this time keeping the result), then
# call add_delta_timestamp_cols for column 'a'; its result is only displayed.
date_df = add_missing_timestamps(date_df, 'date', groupby='b')
add_delta_timestamp_cols(date_df, cols='a')

# Random tensor in which every negative entry is replaced by NaN.
t = torch.randn(2, 3, 5)
t[t<0] = np.nan
# Boolean mask marking the NaN positions; this mask (not `t`) is what the function receives.
mask = torch.isnan(t)
print(t)
# Default call followed by the 'backward' variant.
delta_timestamps_torch(mask), delta_timestamps_torch(mask, 'backward')

tensor([[[   nan, 0.9711,    nan, 0.4744,    nan],
         [   nan, 0.5079,    nan,    nan, 0.3336],
         [   nan,    nan,    nan, 0.3110,    nan]],

        [[0.4117,    nan,    nan, 1.7327, 0.3298],
         [1.0278, 1.0964,    nan,    nan, 1.3388],
         [2.1505,    nan, 0.7078,    nan,    nan]]])

(tensor([[[1., 2., 1., 2., 1.],
          [1., 2., 1., 2., 3.],
          [1., 2., 3., 4., 1.]],
 
         [[1., 1., 2., 3., 1.],
          [1., 1., 1., 2., 3.],
          [1., 1., 2., 1., 2.]]]),
 tensor([[[1., 2., 1., 2., 1.],
          [1., 3., 2., 1., 1.],
          [3., 2., 1., 2., 1.]],
 
         [[3., 2., 1., 1., 1.],
          [1., 3., 2., 1., 1.],
          [2., 1., 3., 2., 1.]]]))

# Check the gap helpers on both a torch tensor and a numpy array.
t = torch.rand(1, 2, 8)
# Tensor.numpy() shares memory with `t`, so the NaNs written on the next line also appear in `arr`.
arr = t.numpy()
t[t <.6] = np.nan
# Consecutive nearest_gaps values are expected to differ by at most 1 in either direction.
test_eq(torch.diff(nearest_gaps(t)).min().item(), -1)
test_eq(np.diff(nearest_gaps(arr)).min(), -1)
test_eq(torch.diff(nearest_gaps(t)).max().item(), 1)
test_eq(np.diff(nearest_gaps(arr)).max(),1)
# The gap outputs themselves must not contain NaNs.
test_eq(torch.isnan(forward_gaps(t)).sum(), 0)
test_eq(np.isnan(forward_gaps(arr)).sum(), 0)
# all_gaps is expected to return 6 channels for the 2 input channels.
ag = all_gaps(t)
test_eq(ag.shape, (1,6,8))
test_eq(torch.isnan(ag).sum(), 0)

                    * 0 for last step in each sub-window.
                    * n > 0 for a range of n future steps (1 to n).
                    * n < 0 for a range of n past steps (-n + 1 to 0).
                    * list : for those exact timesteps.
get_x               = indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y               = indices of columns that contain the target (ys). If None, all data will be used as y. [] means no y data is created (unlabeled data).
y_func              = function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
seq_first           = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by             = column/s used for sorting the array in ascending order
ascending           = used in sorting
check_leakage       = checks if there's leakage in the output between X and y

# Non-overlapping windows (stride == window length) over 13 steps with pad_remainder=True.
wl = 5
stride = 5

# 13 time steps x 3 identical variables.
t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
# get_y=[] means no y data is created (unlabeled data).
X, y = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X

input shape: (13, 3)

array([[[ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  1.,  2.,  3.,  4.],
        [ 0.,  0.,  0.,  0.,  0.]],

       [[ 5.,  6.,  7.,  8.,  9.],
        [ 5.,  6.,  7.,  8.,  9.],
        [ 5.,  6.,  7.,  8.,  9.],
        [ 0.,  0.,  0.,  0.,  0.]],

       [[10., 11., 12., nan, nan],
        [10., 11., 12., nan, nan],
        [10., 11., 12., nan, nan],
        [ 0.,  0.,  0.,  1.,  1.]]])

# 1D input with default arguments: each item of X is expected to have shape (1, wl).
wl = 5
t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl)(t)
test_eq(X.shape[1:], (1, wl))
itemify(X,)

input shape: (10,)

(#5) [(array([[0, 1, 2, 3, 4]]),),(array([[1, 2, 3, 4, 5]]),),(array([[2, 3, 4, 5, 6]]),),(array([[3, 4, 5, 6, 7]]),),(array([[4, 5, 6, 7, 8]]),)]

# Univariate input with horizon=1: each target is expected to be a scalar (shape ()).
wl = 5
h = 1

t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())

input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9)]

# Univariate input with a multi-step horizon: each target is expected to have shape (2,).
wl = 5
h = 2 # 2 or more

t = np.arange(10)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, horizon=h)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))

input shape: (10,)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]

# Input shaped (n_vars, seq_len), hence seq_first=False; get_y=None uses all data as y.
wl = 5
h = 2 # 2 or more

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, (2, ))

input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]

# Same as the previous cell but leaving get_y at its default; only the X item shape is asserted.
wl = 5
h = 2 # 2 or more

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=1, horizon=h, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))

input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), array([5, 6])), (array([[1, 2, 3, 4, 5]]), array([6, 7])), (array([[2, 3, 4, 5, 6]]), array([7, 8])), (array([[3, 4, 5, 6, 7]]), array([8, 9]))]

# stride=3 over a (1, 10) input with horizon=1: consecutive windows start 3 steps apart.
wl = 5

t = np.arange(10).reshape(1, -1)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=3, horizon=1, get_y=None, seq_first=False)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())

input shape: (1, 10)
[(array([[0, 1, 2, 3, 4]]), 5), (array([[3, 4, 5, 6, 7]]), 8)]

# start=3 with stride=None: the first window begins at step 3 and the printed windows do not overlap.
wl = 5
start = 3

t = np.arange(20)
print('input shape:', t.shape)
X, y = SlidingWindow(wl, stride=None, horizon=1, start=start)(t)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())

input shape: (20,)
[(array([[3, 4, 5, 6, 7]]), 8), (array([[ 8,  9, 10, 11, 12]]), 13), (array([[13, 14, 15, 16, 17]]), 18)]

# Single-column dataframe input with stride=None (the printed windows do not overlap).
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var'])
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())

input shape: (20,)

[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]

# Single-column dataframe input with stride=1 (consecutive windows start one step apart).
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var'])
display(df)
X, y = SlidingWindow(wl, stride=1, horizon=1, get_y=None)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())

input shape: (20,)

[(array([[0, 1, 2, 3, 4]]), 5), (array([[1, 2, 3, 4, 5]]), 6), (array([[2, 3, 4, 5, 6]]), 7), (array([[3, 4, 5, 6, 7]]), 8), (array([[4, 5, 6, 7, 8]]), 9), (array([[5, 6, 7, 8, 9]]), 10), (array([[ 6,  7,  8,  9, 10]]), 11), (array([[ 7,  8,  9, 10, 11]]), 12), (array([[ 8,  9, 10, 11, 12]]), 13), (array([[ 9, 10, 11, 12, 13]]), 14), (array([[10, 11, 12, 13, 14]]), 15), (array([[11, 12, 13, 14, 15]]), 16), (array([[12, 13, 14, 15, 16]]), 17), (array([[13, 14, 15, 16, 17]]), 18), (array([[14, 15, 16, 17, 18]]), 19)]

# Transposed dataframe (variables in rows, steps in columns), hence seq_first=False.
wl = 5

t = np.arange(20)
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=['var']).T
display(df)
X, y = SlidingWindow(wl, stride=None, horizon=1, get_y=None, seq_first=False)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (1, wl))
test_eq(items[0][1].shape, ())

input shape: (20,)

[(array([[0, 1, 2, 3, 4]]), 5), (array([[5, 6, 7, 8, 9]]), 10), (array([[10, 11, 12, 13, 14]]), 15)]

# Multivariate dataframe; get_y is left at its default, so y has one value per variable.
wl = 5
n_vars = 3

# Columns are 0..9 scaled by 1, 10 and 100 so each variable is easy to recognise in the output.
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))

input shape: torch.Size([10, 3])

[(array([[  0,   1,   2,   3,   4],
       [  0,  10,  20,  30,  40],
       [  0, 100, 200, 300, 400]]), array([  5,  50, 500])), (array([[  1,   2,   3,   4,   5],
       [ 10,  20,  30,  40,  50],
       [100, 200, 300, 400, 500]]), array([  6,  60, 600])), (array([[  2,   3,   4,   5,   6],
       [ 20,  30,  40,  50,  60],
       [200, 300, 400, 500, 600]]), array([  7,  70, 700])), (array([[  3,   4,   5,   6,   7],
       [ 30,  40,  50,  60,  70],
       [300, 400, 500, 600, 700]]), array([  8,  80, 800])), (array([[  4,   5,   6,   7,   8],
       [ 40,  50,  60,  70,  80],
       [400, 500, 600, 700, 800]]), array([  9,  90, 900]))]

# Multivariate dataframe with get_y=0: only the first column is used as target.
wl = 5
n_vars = 3

# Columns are 0..9 scaled by 1, 10 and 100 so each variable is easy to recognise in the output.
t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
display(df)
X, y = SlidingWindow(wl, horizon=1, get_y=0)(df)
items = itemify(X, y)
print(items)
test_eq(items[0][0].shape, (n_vars, wl))

input shape: torch.Size([10, 3])

[(array([[  0,   1,   2,   3,   4],
       [  0,  10,  20,  30,  40],
       [  0, 100, 200, 300, 400]]), 5), (array([[  1,   2,   3,   4,   5],
       [ 10,  20,  30,  40,  50],
       [100, 200, 300, 400, 500]]), 6), (array([[  2,   3,   4,   5,   6],
       [ 20,  30,  40,  50,  60],
       [200, 300, 400, 500, 600]]), 7), (array([[  3,   4,   5,   6,   7],
       [ 30,  40,  50,  60,  70],
       [300, 400, 500, 600, 700]]), 8), (array([[  4,   5,   6,   7,   8],
       [ 40,  50,  60,  70,  80],
       [400, 500, 600, 700, 800]]), 9)]

# Select x and y by column name: all columns but the last are inputs, 'target' is the label.
wl = 5
n_vars = 3

t = (torch.stack(n_vars * [torch.arange(10)]).T * tensor([1, 10, 100]))
print('input shape:', t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(wl, horizon=1, get_x=columns[:-1], get_y='target')(df)
items = itemify(X, y)
print(items)
# X keeps only the n_vars-1 input columns; y is a scalar per window.
test_eq(items[0][0].shape, (n_vars-1, wl))
test_eq(items[0][1].shape, ())

input shape: torch.Size([10, 3])

[(array([[ 0,  1,  2,  3,  4],
       [ 0, 10, 20, 30, 40]]), 500), (array([[ 1,  2,  3,  4,  5],
       [10, 20, 30, 40, 50]]), 600), (array([[ 2,  3,  4,  5,  6],
       [20, 30, 40, 50, 60]]), 700), (array([[ 3,  4,  5,  6,  7],
       [30, 40, 50, 60, 70]]), 800), (array([[ 4,  5,  6,  7,  8],
       [40, 50, 60, 70, 80]]), 900)]

# Random-walk array; horizon=0 takes the target from the last step of each sub-window.
n_vars = 3

t = (np.random.rand(1000, n_vars) - .5).cumsum(0)
print(t.shape)
plt.plot(t)
plt.show()
# Columns 0 and 1 are the inputs, column 2 is the target.
X, y = SlidingWindow(5, stride=None, horizon=0, get_x=[0,1], get_y=2)(t)
# `wl` is not set in this cell: the assertion relies on the value (5) left by an earlier cell.
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)

(1000, 3)

(200, 2, 5) (200,)

# Random-walk dataframe with named columns; horizon=0 and default stride.
wl = 5
n_vars = 3

t = (np.random.rand(100, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, horizon=0, get_x=columns[:-1], get_y='target')(df)
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)

(100, 3)

(96, 2, 5) (96,)

# Dataframe shaped (seq_len, n_vars) passed with seq_first=True and stride=1.
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
# `wl` is not set in this cell: the assertion relies on the value (5) left by an earlier cell.
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)

(100, 5)

(96, 4, 5) (96,)

# Transposed dataframe (variables in rows) passed with seq_first=False and stride=1.
seq_len = 100
n_vars = 5

t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
# `wl` is not set in this cell: the assertion relies on the value (5) left by an earlier cell.
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)

(100, 5)

(96, 4, 5) (96,)

# Transposed dataframe (variables in rows) passed with seq_first=False and stride=None.
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)] + ['target']
df = pd.DataFrame(t, columns=columns).T
display(df)
X, y = SlidingWindow(5, stride=None, horizon=0, get_x=columns[:-1], get_y='target', seq_first=False)(df)
# `wl` is not set in this cell: the assertion relies on the value (5) left by an earlier cell.
test_eq(X[0].shape, (n_vars-1, wl))
test_eq(y[0].shape, ())
print(X.shape, y.shape)

(100, 5)

(20, 4, 5) (20,)

# Build windows and then split them into train/valid sets without shuffling.
seq_len = 100
n_vars = 5
t = (np.random.rand(seq_len, n_vars) - .5).cumsum(0)
print(t.shape)
columns=[f'var_{i}' for i in range(n_vars-1)]+['target']
df = pd.DataFrame(t, columns=columns)
display(df)
X, y = SlidingWindow(5, stride=1, horizon=0, get_x=columns[:-1], get_y='target', seq_first=True)(df)
# shuffle=False: in the recorded output the validation indices follow the training indices.
splits = TrainValidTestSplitter(valid_size=.2, shuffle=False)(y)
X.shape, y.shape, splits

(100, 5)

((96, 4, 5),
 (96,),
 ((#77) [0,1,2,3,4,5,6,7,8,9...], (#19) [77,78,79,80,81,82,83,84,85,86...]))

                    * 0 for last step in each sub-window.
                    * n > 0 for a range of n future steps (1 to n).
                    * n < 0 for a range of n past steps (-n + 1 to 0).
                    * list : for those exact timesteps.
get_x               = indices of columns that contain the independent variable (xs). If None, all data will be used as x.
get_y               = indices of columns that contain the target (ys). If None, all data will be used as y. [] means no y data is created (unlabeled data).
y_func              = function to calculate the ys based on the get_y col/s and each y sub-window. y_func must be a function applied to axis=1!
seq_first           = True if input shape (seq_len, n_vars), False if input shape (n_vars, seq_len)
sort_by             = column/s used for sorting the array in ascending order
ascending           = used in sorting
check_leakage       = checks if there's leakage in the output between X and y
return_key          = when True, the key corresponding to unique_id_cols for each sample is returned
verbose             = controls verbosity. True or 1 displays progress bar. 2 or more shows records that cannot be created due to their length.

# Build a shuffled panel dataframe made of three stacked copies of the same series:
# device 0 / region A (full length), device 1 / region A (first 4 rows only) and
# device 2 / region B (full length).
samples = 100_000
wl = 5
n_vars = 10

# Column i holds 0..samples-1 scaled by 10**i.
t = (torch.stack(n_vars * [torch.arange(samples)]).T * tensor([10**i for i in range(n_vars)]))
df = pd.DataFrame(t, columns=[f'var_{i}' for i in range(n_vars)])
df['time'] = np.arange(len(t))
df['device'] = 0
df['target'] = np.random.randint(0, 2, len(df))
df2 = df.copy()
df3 = df.copy()
# Offset a few columns (including 'device') so the copies are distinguishable.
cols = ['var_0', 'var_1', 'var_2', 'device', 'target']
df2[cols] = df2[cols] + 1
df3[cols] = df3[cols] + 2
# .loc[:3] is label-based and inclusive -> 4 rows. Copy so the 'region' assignment
# below writes to an independent frame rather than to a slice of the original.
df2 = df2.loc[:3].copy()
df['region'] = 'A'
df2['region'] = 'A'
df3['region'] = 'B'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in the same order.
df = pd.concat([df, df2, df3]).reset_index(drop=True)
df['index'] = np.arange(len(df))
# Shuffle all rows.
df = df.sample(frac=1).reset_index(drop=True)
display(df.head())
df.shape

(200004, 15)

# One set of windows per 'device', sorted by 'time' within each device; horizon=0.
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          horizon=0, seq_first=True, sort_by=['time'], ascending=True, return_key=False)(df)
X.shape, y.shape

((199992, 10, 5), (199992,))

# Same call with return_key=True: a third output holding the unique_id_cols key of each sample is returned.
X, y, key = SlidingWindowPanel(window_len=5, unique_id_cols=['device'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                               horizon=0, seq_first=True, sort_by=['time'], ascending=True, return_key=True)(df)
X.shape, y.shape, key.shape

((199992, 10, 5), (199992,), (199992,))

# Windows grouped by the combination of two id columns ('device', 'region').
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          horizon=0, seq_first=True, sort_by=['time'], ascending=True)(df)
X.shape, y.shape

((199992, 10, 5), (199992,))

def y_max(o):
    "Maximum of `o` taken along axis 1"
    return np.amax(o, axis=1)

# horizon=5 with y_func=y_max: each label is the maximum target over the 5-step horizon.
X, y = SlidingWindowPanel(window_len=5, unique_id_cols=['device', 'region'], stride=1, start=0, get_x=df.columns[:n_vars], get_y=['target'], 
                          y_func=y_max, horizon=5, seq_first=True, sort_by=['time'], ascending=True)(df)
X.shape, y.shape

((199982, 10, 5), (199982,))

# Create padded windows, add extra NaNs by hand, then run identify_padding on the NaN mask.
wl = 5
stride = 5

t = np.repeat(np.arange(13).reshape(-1,1), 3, axis=-1)
print('input shape:', t.shape)
X, _ = SlidingWindow(wl, stride=stride, pad_remainder=True, get_y=[])(t)
X = tensor(X)
# NaNs at the end of a single variable of the first window...
X[0, 1, -2:] = np.nan
# ...and at the start of every variable of the second window.
X[1,..., :3] = np.nan
print(X)
# The function receives the NaN mask converted to float, not the data itself.
identify_padding(torch.isnan(X).float())

input shape: (13, 3)
tensor([[[ 0.,  1.,  2.,  3.,  4.],
         [ 0.,  1.,  2., nan, nan],
         [ 0.,  1.,  2.,  3.,  4.],
         [ 0.,  0.,  0.,  0.,  0.]],

        [[nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  8.,  9.],
         [nan, nan, nan,  0.,  0.]],

        [[10., 11., 12., nan, nan],
         [10., 11., 12., nan, nan],
         [10., 11., 12., nan, nan],
         [ 0.,  0.,  0.,  1.,  1.]]])

tensor([[[0., 0., 0., 0., 0.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.],
         [1., 1., 1., 0., 0.]],

        [[0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0.]]])

	sample_id	feat_id	1	2	3	4	5	6	target	target2
0	803	8	0.067161	-0.986001	1.019658	1.344542	1.540479	0.302584	c	c
1	44	6	2.120347	-0.543931	1.727680	0.161199	1.238199	0.593076	c	c
2	780	6	-0.540911	0.595021	1.187320	0.077753	1.350728	0.389279	c	c
3	557	4	0.120909	1.783199	1.634337	0.115997	0.593131	0.013971	a	a
4	244	2	1.076255	-0.133813	0.289658	-0.524156	-0.062961	0.409407	b	b
...	...	...	...	...	...	...	...	...	...	...
9995	82	7	1.405801	0.836629	1.209675	-0.514467	0.415029	-1.309704	b	b
9996	130	3	0.869220	0.635056	0.727128	0.211559	0.826354	-0.135654	a	a
9997	756	8	0.440011	0.199001	0.145893	0.602251	-0.064006	-0.357997	b	b
9998	580	1	0.241159	0.880484	-0.171348	-0.550948	0.653416	-2.418535	b	b
9999	670	3	-0.838064	2.130403	-0.832883	-0.269318	0.209982	-0.636478	c	c

	A	B	missing_A	missing_B
0	-1.512289	-0.834380	0.0	0.0
1	0.042971	NaN	0.0	1.0
2	-1.135203	-1.474360	0.0	0.0
3	-2.390697	-1.619872	0.0	0.0
4	NaN	0.353415	1.0	0.0
5	-0.242936	0.230148	0.0	0.0
6	-0.386489	-0.821250	0.0	0.0
7	0.001340	-2.599201	0.0	0.0
8	0.074308	NaN	0.0	1.0
9	0.171828	-1.438286	0.0	0.0

	date	a	b
0	2021-05-01	0.308879	0.0
1	2021-05-02	0.460018	1.0
2	2021-05-03	0.029961	0.0
3	2021-05-06	0.797310	0.0
4	2021-05-07	0.239729	1.0

	date	a	b
0	2021-05-01	0.308879	0.0
1	2021-05-02	0.460018	1.0
2	2021-05-03	0.029961	0.0
3	2021-05-04	NaN	NaN
4	2021-05-05	NaN	NaN
5	2021-05-06	0.797310	0.0
6	2021-05-07	0.239729	1.0

	date	a	b
0	2021-05-01	0.308879	0.0
1	2021-05-02	NaN	0.0
2	2021-05-03	0.029961	0.0
3	2021-05-04	NaN	0.0
4	2021-05-05	NaN	0.0
5	2021-05-06	0.797310	0.0
6	2021-05-02	0.460018	1.0
7	2021-05-03	NaN	1.0
8	2021-05-04	NaN	1.0
9	2021-05-05	NaN	1.0
10	2021-05-06	NaN	1.0
11	2021-05-07	0.239729	1.0

Data preparation

`df2Xy`[source]

`split_Xy`[source]

`df2np3d`[source]

`add_missing_value_cols`[source]

`add_missing_timestamps`[source]

`time_encoding`[source]

`delta_timestamps`[source]

`add_delta_timestamp_cols`[source]

`delta_timestamps_torch`[source]

`forward_gaps`[source]

`backward_gaps`[source]

`nearest_gaps`[source]

`all_gaps`[source]

`SlidingWindow`[source]

`SlidingWindowPanel`[source]

`identify_padding`[source]

	sample_id	timestep	values	target
0	rob	1	2	hot
1	alice	1	6	lukewarm
2	eve	1	11	cold
3	rob	2	3	hot
4	alice	2	7	lukewarm
5	eve	2	12	cold

	sample_id	trait	value_0	value_1	target
5	alice	yellow	7	8	lukewarm
10	eve	blue	12	12	cold
1	rob	yellow	3	4	hot
4	alice	green	6	7	lukewarm
11	eve	red	13	14	cold
8	eve	yellow	11	12	cold
3	rob	red	5	6	hot
6	alice	blue	8	9	lukewarm
7	alice	red	9	10	lukewarm
0	rob	green	2	3	hot
9	eve	green	10	11	cold
2	rob	blue	4	5	hot

	sample_id	trait	value_0	value_1	target1	target2
0	rob	green	2	3	hot	good
1	rob	yellow	3	4	hot	good
2	rob	blue	4	5	hot	good
3	rob	red	5	6	hot	good
4	alice	green	6	7	lukewarm	good
5	alice	yellow	7	8	lukewarm	good
6	alice	blue	8	9	lukewarm	good
7	alice	red	9	10	lukewarm	good
8	eve	yellow	11	12	cold	bad
9	eve	green	10	11	cold	bad
10	eve	blue	12	12	cold	bad
11	eve	red	13	14	cold	bad

	date	a	b	dow_sin	dow_cos
0	2021-05-01	0.308879	0.0	-0.974928	-0.222521
1	2021-05-02	0.460018	1.0	-0.781831	0.623490
2	2021-05-03	0.029961	0.0	0.000000	1.000000
3	2021-05-06	0.797310	0.0	0.433884	-0.900969
4	2021-05-07	0.239729	1.0	-0.433884	-0.900969

	var
0	0
1	1
2	2
3	3
4	4
5	5
6	6
7	7
8	8
9	9
10	10
11	11
12	12
13	13
14	14
15	15
16	16
17	17
18	18
19	19

	var
0	0
1	1
2	2
3	3
4	4
5	5
6	6
7	7
8	8
9	9
10	10
11	11
12	12
13	13
14	14
15	15
16	16
17	17
18	18
19	19

	var_0	var_1	var_2
0	0	0	0
1	1	10	100
2	2	20	200
3	3	30	300
4	4	40	400
5	5	50	500
6	6	60	600
7	7	70	700
8	8	80	800
9	9	90	900

	var_0	var_1	var_2
0	0	0	0
1	1	10	100
2	2	20	200
3	3	30	300
4	4	40	400
5	5	50	500
6	6	60	600
7	7	70	700
8	8	80	800
9	9	90	900

	var_0	var_1	target
0	0	0	0
1	1	10	100
2	2	20	200
3	3	30	300
4	4	40	400
5	5	50	500
6	6	60	600
7	7	70	700
8	8	80	800
9	9	90	900

	var_0	var_1	target
0	-0.406232	-0.414089	-0.400882
1	-0.471705	-0.298944	-0.208881
2	-0.945880	-0.323464	0.148857
3	-0.993279	-0.603344	0.492366
4	-1.322766	-0.383789	0.639112
...	...	...	...
95	-4.658289	-1.637059	0.216876
96	-4.401153	-1.764843	0.375077
97	-4.043994	-2.080173	-0.113987
98	-4.382016	-1.957085	0.271832
99	-4.070003	-1.514759	0.242690

	var_0	var_1	var_2	var_3	target
0	0.324556	-0.024634	-0.324003	0.152470	-0.079307
1	-0.100608	-0.453363	-0.543349	-0.232345	-0.161893
2	0.107114	-0.404652	-0.924248	-0.217626	0.099639
3	0.448838	-0.134955	-1.360108	-0.373151	0.332245
4	0.036596	0.334262	-1.529533	-0.430651	0.121113
...	...	...	...	...	...
95	-2.867835	5.313318	-1.985245	5.173193	5.231640
96	-2.555572	5.058395	-2.221140	5.327731	5.614501
97	-2.972194	5.111336	-2.319026	5.787613	5.755481
98	-2.546907	4.736859	-2.575651	6.065751	5.269597
99	-2.738738	4.606884	-2.828786	6.186194	5.157103

	0	1	2	3	4	5	6	7	8	9	...	90	91	92	93	94	95	96	97	98	99
var_0	0.142038	-0.241951	-0.674646	-0.329895	-0.806373	-0.504457	-0.091007	0.041660	0.226693	0.717070	...	0.714998	1.104654	1.392867	1.526051	1.350011	0.929920	1.392436	1.278610	1.566185	1.434742
var_1	0.344013	0.371486	-0.021897	-0.374331	-0.495772	-0.239106	-0.522420	-0.162693	-0.502810	-0.784349	...	0.844331	0.470277	0.033467	0.326548	0.186578	0.101807	0.297498	0.590168	0.410756	0.091508
var_2	-0.453039	-0.674366	-0.934097	-0.846350	-0.551366	-0.488994	-0.287651	-0.636802	-0.249811	-0.495553	...	-0.282373	-0.547764	-0.671241	-0.368870	0.107557	0.199508	0.331823	0.685088	0.207348	0.115582
var_3	-0.075391	-0.440138	-0.401258	-0.742387	-0.992271	-1.093716	-1.291920	-0.962884	-0.913331	-0.999138	...	1.212462	1.431318	1.220678	1.308815	1.180004	1.401881	1.529010	1.905840	2.008138	1.925418
target	0.101466	-0.075994	-0.392708	-0.268995	0.058228	0.262021	-0.162348	-0.520289	-0.147791	-0.552750	...	-1.907877	-2.128015	-1.630508	-1.739854	-2.097548	-1.910264	-1.519971	-1.362846	-1.589207	-1.388956

	0	1	2	3	4	5	6	7	8	9	...	90	91	92	93	94	95	96	97	98	99
var_0	0.333079	0.667278	0.555304	0.525358	0.852389	0.851425	1.002733	0.503167	0.590213	0.684278	...	0.804845	0.531329	0.661458	0.957019	0.784238	0.338456	-0.158813	0.286519	0.072473	0.428845
var_1	0.434652	0.617642	0.906403	1.220523	1.396725	1.392244	1.738090	1.251832	1.318959	1.433546	...	3.907298	3.613984	3.343749	3.277451	3.512521	3.204122	2.772183	2.611075	2.587330	2.281425
var_2	-0.156482	-0.583163	-0.531841	-0.638756	-0.239909	0.012355	0.018534	0.336374	0.200241	-0.143604	...	0.062639	0.348671	0.708028	0.490546	0.364221	-0.106238	0.083789	-0.053816	-0.372394	-0.346548
var_3	0.061416	0.180442	-0.273987	-0.493369	-0.529733	-0.848908	-0.684956	-0.279506	-0.493934	-0.260583	...	3.471103	3.250781	2.950769	3.018413	2.626885	2.602813	3.008129	2.994499	2.726762	2.260312
target	0.063443	-0.433004	0.060555	-0.229963	-0.678588	-0.179883	-0.101206	-0.140428	-0.114329	-0.247860	...	3.094354	3.543678	3.484297	3.504939	3.472951	3.840648	4.188283	4.479545	4.571833	4.149634

	var_0	var_1	var_2	var_3	target
0	0.116937	0.033255	0.112208	-0.269714	0.426030
1	-0.280513	0.275669	0.557621	-0.300125	0.416640
2	-0.399886	0.451080	0.395604	-0.599066	0.786052
3	-0.302637	0.440277	-0.067934	-0.343010	0.720093
4	-0.616455	0.343279	-0.299689	-0.668935	0.602427
...	...	...	...	...	...
95	-6.978802	-5.309753	-5.987701	1.347099	4.823679
96	-6.500155	-5.724755	-6.109362	1.583891	4.635394
97	-6.364807	-5.350699	-5.934773	1.895214	4.501355
98	-6.212461	-4.956514	-6.146901	2.242560	4.947337
99	-6.051990	-4.517483	-5.755917	1.762457	4.797648

	var
0	0
1	1
2	2
3	3
4	4
5	5
6	6
7	7
8	8
9	9
10	10
11	11
12	12
13	13
14	14
15	15
16	16
17	17
18	18
19	19

	var
0	0
1	1
2	2
3	3
4	4
5	5
6	6
7	7
8	8
9	9
10	10
11	11
12	12
13	13
14	14
15	15
16	16
17	17
18	18
19	19

	var_0	var_1	var_2
0	0	0	0
1	1	10	100
2	2	20	200
3	3	30	300
4	4	40	400
5	5	50	500
6	6	60	600
7	7	70	700
8	8	80	800
9	9	90	900

	var_0	var_1	var_2
0	0	0	0
1	1	10	100
2	2	20	200
3	3	30	300
4	4	40	400
5	5	50	500
6	6	60	600
7	7	70	700
8	8	80	800
9	9	90	900

	var_0	var_1	target
0	0	0	0
1	1	10	100
2	2	20	200
3	3	30	300
4	4	40	400
5	5	50	500
6	6	60	600
7	7	70	700
8	8	80	800
9	9	90	900

	var_0	var_1	var_2	var_3	var_4	var_5	var_6	var_7	var_8	var_9	time	device	target	region	index
0	4839	48390	483900	4839000	48390000	483900000	4839000000	48390000000	483900000000	4839000000000	4839	0	0	A	4839
1	56391	563892	5638902	56389000	563890000	5638900000	56389000000	563890000000	5638900000000	56389000000000	56389	2	2	B	156393
2	78837	788352	7883502	78835000	788350000	7883500000	78835000000	788350000000	7883500000000	78835000000000	78835	2	2	B	178839
3	28645	286450	2864500	28645000	286450000	2864500000	28645000000	286450000000	2864500000000	28645000000000	28645	0	1	A	28645
4	41963	419630	4196300	41963000	419630000	4196300000	41963000000	419630000000	4196300000000	41963000000000	41963	0	0	A	41963

Data preparation

df2Xy[source]

split_Xy[source]

df2np3d[source]

add_missing_value_cols[source]

add_missing_timestamps[source]

time_encoding[source]

delta_timestamps[source]

add_delta_timestamp_cols[source]

delta_timestamps_torch[source]

forward_gaps[source]

backward_gaps[source]

nearest_gaps[source]

all_gaps[source]

SlidingWindow[source]

SlidingWindowPanel[source]

identify_padding[source]

`df2Xy`[source]

`split_Xy`[source]

`df2np3d`[source]

`add_missing_value_cols`[source]

`add_missing_timestamps`[source]

`time_encoding`[source]

`delta_timestamps`[source]

`add_delta_timestamp_cols`[source]

`delta_timestamps_torch`[source]

`forward_gaps`[source]

`backward_gaps`[source]

`nearest_gaps`[source]

`all_gaps`[source]

`SlidingWindow`[source]

`SlidingWindowPanel`[source]

`identify_padding`[source]

	var
0	0
1	1
2	2
3	3
4	4
5	5
6	6
7	7
8	8
9	9
10	10
11	11
12	12
13	13
14	14
15	15
16	16
17	17
18	18
19	19

	var
0	0
1	1
2	2
3	3
4	4
5	5
6	6
7	7
8	8
9	9
10	10
11	11
12	12
13	13
14	14
15	15
16	16
17	17
18	18
19	19

	var_0	var_1	var_2
0	0	0	0
1	1	10	100
2	2	20	200
3	3	30	300
4	4	40	400
5	5	50	500
6	6	60	600
7	7	70	700
8	8	80	800
9	9	90	900

	var_0	var_1	var_2
0	0	0	0
1	1	10	100
2	2	20	200
3	3	30	300
4	4	40	400
5	5	50	500
6	6	60	600
7	7	70	700
8	8	80	800
9	9	90	900

	var_0	var_1	target
0	0	0	0
1	1	10	100
2	2	20	200
3	3	30	300
4	4	40	400
5	5	50	500
6	6	60	600
7	7	70	700
8	8	80	800
9	9	90	900