Type 3: Event Data (Irregular Time-Series)

Possible actions:

  • Synthesize new events

  • Synthesize the same panel members

  • Synthesize new panel members

import pandas as pd
from synthesized import HighDimSynthesizer, MetaExtractor
# Load the raw transaction log in a single chain: read the CSV, parse the
# DATE column into timestamps, then let pandas pick the best NA-aware dtype
# for every column (missing amounts print as <NA>).
df = (
    pd.read_csv("bank_transactions.csv")
    .assign(DATE=lambda frame: pd.to_datetime(frame["DATE"]))
    .convert_dtypes()
)
print(df)
                     DATE Account No  DEPOSIT AMT  WITHDRAWAL AMT    BALANCE AMT
0     2018-01-01 04:44:58     ABC002    1000000.0            <NA>      1000000.0
1     2018-01-01 06:18:58     ABC001     200000.0            <NA>       200000.0
2     2018-01-02 03:46:21     ABC002     300000.0            <NA>      1300000.0
3     2018-01-02 11:57:04     ABC001         <NA>       5000000.0     -4800000.0
4     2018-01-03 02:47:56     ABC002     800106.0            <NA>      2100106.0
                      ...        ...          ...             ...            ...
75138 2020-12-30 11:10:36     ABC004    104209.71            <NA>  -3635697000.0
75139 2020-12-30 12:27:04     ABC003    3000000.0            <NA>  10698427000.0
75140 2020-12-30 15:50:58     ABC005         <NA>       15991.934  -1503678200.0
75141 2020-12-30 16:39:08     ABC002         <NA>          4000.0  -9969822000.0
75142 2020-12-30 18:52:12     ABC004         <NA>        300043.7  -3635997000.0

[75143 rows x 5 columns]
def _balance_to_diff(account_rows):
    """Replace one account's amount columns with a per-event balance change.

    The change is the difference between consecutive ``BALANCE AMT`` values;
    the gap left by ``diff()`` on the first row is filled with the account's
    first balance, i.e. the first event is measured from a zero balance.
    """
    balance = account_rows["BALANCE AMT"]
    change = balance.diff().fillna(balance.iloc[0])
    # NOTE(review): the change is only as precise as BALANCE AMT itself; in the
    # printed sample a 104209.71 deposit comes out as 104000.0 -- confirm this
    # loss of precision is acceptable.
    return account_rows.assign(BALANCE_DIFF=change).drop(
        columns=["WITHDRAWAL AMT", "DEPOSIT AMT", "BALANCE AMT"]
    )


# Transform each account independently, then restore the original row order.
dfb = df.groupby(df["Account No"], group_keys=False).apply(_balance_to_diff)
dfb = dfb.sort_index()
# assign() needs an identifier-style keyword, so the underscore is swapped
# for a space afterwards to match the naming of the other columns.
dfb = dfb.rename(columns=lambda name: name.replace("_", " "))
print(dfb)
                     DATE Account No  BALANCE DIFF
0     2018-01-01 04:44:58     ABC002     1000000.0
1     2018-01-01 06:18:58     ABC001      200000.0
2     2018-01-02 03:46:21     ABC002      300000.0
3     2018-01-02 11:57:04     ABC001    -5000000.0
4     2018-01-03 02:47:56     ABC002      800106.0
                      ...        ...           ...
75138 2020-12-30 11:10:36     ABC004      104000.0
75139 2020-12-30 12:27:04     ABC003     3000000.0
75140 2020-12-30 15:50:58     ABC005      -16000.0
75141 2020-12-30 16:39:08     ABC002       -4000.0
75142 2020-12-30 18:52:12     ABC004     -300000.0

[75143 rows x 3 columns]
from synthesized.model import DataFrameModel

# Extract the column metadata from the balance-diff frame; the same metadata
# object is handed to the model below.
df_meta = MetaExtractor.extract(dfb)

# Fit a DataFrameModel on the same frame and plot it. The trailing semicolon
# is kept from the notebook, where it hides the repr of the plot's return value.
df_model = DataFrameModel(df_meta).fit(dfb)
df_model.plot();
[Figure: transactional_data_4_0 (plot produced by the DataFrameModel.plot() call above)]
# Train the synthesizer on the balance-diff frame and sample as many rows as
# the training data contains.
synth = HighDimSynthesizer(df_meta)
synth.learn(df_train=dfb)
df_synth = synth.synthesize(num_rows=len(dfb))

# Order the events chronologically (with a fresh 0..n-1 index) so that the
# running balance computed below accumulates in time order.
df_synth = df_synth.set_index("DATE").sort_index().reset_index()


def _diff_to_transactions(account_rows):
    """Rebuild deposit / withdrawal / balance columns for one account.

    Positive balance changes become deposits, negative ones become (positive)
    withdrawals, and the running sum of the changes is the balance. Rows with
    a zero change get neither a deposit nor a withdrawal.
    """
    change = account_rows["BALANCE DIFF"]
    # The columns are created directly under their space-separated names so
    # the result matches the schema of the source data ("DEPOSIT AMT", ...)
    # instead of leaving identifier-style "DEPOSIT_AMT" names behind.
    return account_rows.assign(
        **{
            "DEPOSIT AMT": change.where(change > 0, other=pd.NA),
            "WITHDRAWAL AMT": -(change.where(change < 0, other=pd.NA)),
            "BALANCE AMT": change.cumsum(),
        }
    ).drop(columns=["BALANCE DIFF"])


# Invert the balance-diff transform per account, then restore row order.
df_synth = (
    df_synth.groupby(df_synth["Account No"], group_keys=False)
    .apply(_diff_to_transactions)
    .sort_index()
)
print(df_synth)
                     DATE Account No   DEPOSIT AMT  WITHDRAWAL AMT   BALANCE AMT
0     2018-01-01 04:44:58     ABC002  1.000000e+06             NaN  1.000000e+06
1     2018-01-01 06:18:58     ABC001  2.000000e+05             NaN  2.000000e+05
2     2018-01-02 03:46:21     ABC002  3.000000e+05             NaN  1.300000e+06
3     2018-01-02 11:57:04     ABC001           NaN    5.000000e+06 -4.800000e+06
4     2018-01-03 02:47:56     ABC002  8.001060e+05             NaN  2.100106e+06
                      ...        ...           ...             ...           ...
75138 2020-12-30 11:10:36     ABC004  1.042097e+05             NaN -3.635697e+09
75139 2020-12-30 12:27:04     ABC003  3.000000e+06             NaN  1.069843e+10
75140 2020-12-30 15:50:58     ABC005           NaN    1.599193e+04 -1.503678e+09
75141 2020-12-30 16:39:08     ABC002           NaN    4.000000e+03 -9.969822e+09
75142 2020-12-30 18:52:12     ABC004           NaN    3.000437e+05 -3.635997e+09

[75143 rows x 5 columns]