Type 3: Event Data (Irregular Time-Series)
Possible actions:
|
import pandas as pd
from synthesized import HighDimSynthesizer, MetaExtractor
# Load the raw transaction log in one pass: read the CSV, parse the DATE
# column into datetimes, then let pandas pick NA-aware dtypes for every column.
df = (
    pd.read_csv("bank_transactions.csv")
    .assign(DATE=lambda frame: pd.to_datetime(frame["DATE"]))
    .convert_dtypes()
)
print(df)
DATE Account No DEPOSIT AMT WITHDRAWAL AMT BALANCE AMT 0 2018-01-01 04:44:58 ABC002 1000000.0 <NA> 1000000.0 1 2018-01-01 06:18:58 ABC001 200000.0 <NA> 200000.0 2 2018-01-02 03:46:21 ABC002 300000.0 <NA> 1300000.0 3 2018-01-02 11:57:04 ABC001 <NA> 5000000.0 -4800000.0 4 2018-01-03 02:47:56 ABC002 800106.0 <NA> 2100106.0 ... ... ... ... ... 75138 2020-12-30 11:10:36 ABC004 104209.71 <NA> -3635697000.0 75139 2020-12-30 12:27:04 ABC003 3000000.0 <NA> 10698427000.0 75140 2020-12-30 15:50:58 ABC005 <NA> 15991.934 -1503678200.0 75141 2020-12-30 16:39:08 ABC002 <NA> 4000.0 -9969822000.0 75142 2020-12-30 18:52:12 ABC004 <NA> 300043.7 -3635997000.0 [75143 rows x 5 columns]
# Collapse the three amount columns into one signed "BALANCE DIFF" column:
# the change in balance between consecutive transactions of the same account.
#
# This is done with column-wise group operations rather than
# ``groupby(...).apply(lambda ...)``: ``apply`` on a frame that still contains
# the grouping column is deprecated in recent pandas, and it runs a Python
# callback per account.
account = df["Account No"]
balance = df["BALANCE AMT"]

# Per-account difference to the previous row; the first row of each account
# has no predecessor and comes out missing.
balance_delta = balance.groupby(account).diff()

# Seed only that first row of each account with its opening balance, so that a
# cumulative sum of the deltas reproduces the balance series.
is_first_row_of_account = ~account.duplicated()
balance_delta = balance_delta.mask(is_first_row_of_account, balance)

dfb = (
    df.drop(columns=["WITHDRAWAL AMT", "DEPOSIT AMT", "BALANCE AMT"])
    .assign(**{"BALANCE DIFF": balance_delta})
    .sort_index()
)
print(dfb)
DATE Account No BALANCE DIFF 0 2018-01-01 04:44:58 ABC002 1000000.0 1 2018-01-01 06:18:58 ABC001 200000.0 2 2018-01-02 03:46:21 ABC002 300000.0 3 2018-01-02 11:57:04 ABC001 -5000000.0 4 2018-01-03 02:47:56 ABC002 800106.0 ... ... ... 75138 2020-12-30 11:10:36 ABC004 104000.0 75139 2020-12-30 12:27:04 ABC003 3000000.0 75140 2020-12-30 15:50:58 ABC005 -16000.0 75141 2020-12-30 16:39:08 ABC002 -4000.0 75142 2020-12-30 18:52:12 ABC004 -300000.0 [75143 rows x 3 columns]
# Extract the column metadata of the differenced frame; it is reused below to
# construct the synthesizer.
df_meta = MetaExtractor.extract(dfb)

from synthesized.model import DataFrameModel

# Fit a DataFrameModel to the same data and plot it as a visual sanity check.
# The trailing semicolon is kept from the original: in a notebook cell it
# suppresses the echo of the call's return value.
fitted_model = DataFrameModel(df_meta).fit(dfb)
fitted_model.plot();

# Train a synthesizer on the differenced transactions.
synth = HighDimSynthesizer(df_meta)
synth.learn(df_train=dfb)

# Sample as many rows as the training frame has, then put them in
# chronological order with a fresh 0..n-1 index (DATE ends up as the first
# column after the index round-trip).
df_synth = (
    synth.synthesize(num_rows=len(dfb))
    .set_index("DATE")
    .sort_index()
    .reset_index()
)
# Rebuild the original transaction layout (DEPOSIT AMT / WITHDRAWAL AMT /
# BALANCE AMT) from the synthetic per-transaction balance deltas.
#
# The columns are written with their final space-separated names so the result
# has the same schema as the input file; creating them as DEPOSIT_AMT etc.
# without a rename would leave underscore names behind. Column-wise group
# operations replace ``groupby(...).apply(lambda ...)`` so the "Account No"
# key column is never routed through ``apply``.
synth_delta = df_synth["BALANCE DIFF"]
df_synth = (
    df_synth.drop(columns=["BALANCE DIFF"])
    .assign(
        **{
            # Positive deltas are deposits; all other rows are left missing.
            "DEPOSIT AMT": synth_delta.where(synth_delta > 0, other=pd.NA),
            # Negative deltas are withdrawals, reported as positive amounts.
            "WITHDRAWAL AMT": -synth_delta.where(synth_delta < 0, other=pd.NA),
            # Running balance: cumulative sum of the deltas within each account.
            "BALANCE AMT": synth_delta.groupby(df_synth["Account No"]).cumsum(),
        }
    )
    .sort_index()
)
print(df_synth)
DATE Account No DEPOSIT AMT WITHDRAWAL AMT BALANCE AMT 0 2018-01-01 04:44:58 ABC002 1.000000e+06 NaN 1.000000e+06 1 2018-01-01 06:18:58 ABC001 2.000000e+05 NaN 2.000000e+05 2 2018-01-02 03:46:21 ABC002 3.000000e+05 NaN 1.300000e+06 3 2018-01-02 11:57:04 ABC001 NaN 5.000000e+06 -4.800000e+06 4 2018-01-03 02:47:56 ABC002 8.001060e+05 NaN 2.100106e+06 ... ... ... ... ... 75138 2020-12-30 11:10:36 ABC004 1.042097e+05 NaN -3.635697e+09 75139 2020-12-30 12:27:04 ABC003 3.000000e+06 NaN 1.069843e+10 75140 2020-12-30 15:50:58 ABC005 NaN 1.599193e+04 -1.503678e+09 75141 2020-12-30 16:39:08 ABC002 NaN 4.000000e+03 -9.969822e+09 75142 2020-12-30 18:52:12 ABC004 NaN 3.000437e+05 -3.635997e+09 [75143 rows x 5 columns]