## Entity Annotation

In [1]:
import pandas as pd

# Load the example PII dataset from a CSV file in the working directory.
df = pd.read_csv("pii_dataset.csv")
# Bare last expression: the notebook renders the frame as a table.
df

Unnamed: 0,gender,title,first_name,last_name,email,name_partner,gender_partner,postcode,city,street,full_address
0,Male,Mr,Imanol,Kirlin,imanol_kirlin@faulkner.com,Mila Weissnat,Female,AB10 1AB,Aberdeen,Broad Street,"Broad Street, AB10 1AB Aberdeen"
1,Female,Ms,Claudie,Rodriguez,claudierodriguez91@haas.com,Jorja Schuster,Female,IM1 1AG,Isle of Man,Circular Road,"Circular Road, IM1 1AG Isle of Man"
2,Male,Mr,Ismael,Zemlak,ismael-zemlak45@jackson-campbell.info,Jalon Glover,Male,TN34 2EZ,Hastings,Baldslow Road,"Baldslow Road, TN34 2EZ Hastings"
3,Non-Binary,Mx,Jesus,Rutherford,jesus-rutherford61@nunez.com,Martin Kihn,Male,LA22 9HA,Ambleside,Kirkfield,"Kirkfield, LA22 9HA Ambleside"
4,Female,Mrs,Leslee,Brown,leslee_brown42@mendez.org,Derrell Keebler,Male,W9 2BT,London,Shirland Road,"Shirland Road, W9 2BT London"
...,...,...,...,...,...,...,...,...,...,...,...
6068,Female,Ms,Louetta,O'Conner,louetta_o'conner@gallagher.com,Obed Terry,Male,HG4 2QN,Ripon,Bishopton Lane,"Bishopton Lane, HG4 2QN Ripon"
6069,Non-Binary,Mx,Fleet,Thompson,fleet_thompson@thompson.com,Leeann Stoltenberg,Non-Binary,EH10 4AN,Edinburgh,Falcon Avenue,"Falcon Avenue, EH10 4AN Edinburgh"
6070,Male,Mr,Pleasant,Kshlerin,pleasant.kshlerin69@leonard.org,Evelyne Bernier,Female,CM8 1SX,Witham,Holst Avenue,"Holst Avenue, CM8 1SX Witham"
6071,Non-Binary,Mx,Tilden,Dickens,tilden.dickens@alvarez.org,Savion Johns,Male,HA1 2RZ,Harrow,Rosslyn Crescent,"Rosslyn Crescent, HA1 2RZ Harrow"


In [2]:
from synthesized import MetaExtractor
from synthesized.config import AddressLabels, PersonLabels
from synthesized.metadata.value import Address, Person

# Group the four address-related columns of `df` under one Address annotation.
# Each keyword names an address component; each value is the column holding it.
address = Address(
    name="address",
    labels=AddressLabels(
        postcode="postcode",
        street="street",
        city="city",
        full_address="full_address",
    ),
)

# The primary person is described by separate first/last name columns plus
# title, gender and email.
person = Person(
    name="person",
    labels=PersonLabels(
        firstname="first_name",
        lastname="last_name",
        title="title",
        gender="gender",
        email="email",
    ),
)

# The partner is a second, independent Person annotation; only a full-name
# column and a gender column are labelled for it.
person_partner = Person(
    name="person_partner",
    labels=PersonLabels(
        fullname="name_partner",
        gender="gender_partner",
    ),
)

# Extract the dataset metadata with the three annotations attached, then list
# the resulting top-level children to confirm each annotation was picked up.
df_meta = MetaExtractor.extract(
    df,
    annotations=[address, person, person_partner],
)
print(list(df_meta.children))

[<Nominal[object]: Address(name=address)>, <Nominal[object]: Person(name=person)>, <Nominal[object]: Person(name=person_partner)>]


In [3]:
from synthesized.config import HighDimConfig
from synthesized import HighDimSynthesizer

# Configure address handling for the en_GB locale.
# NOTE(review): sample_addresses=False presumably makes the synthesizer
# generate new addresses rather than sample them from the training data --
# confirm against the synthesized documentation.
config = HighDimConfig(
    sample_addresses=False,
    address_locale="en_GB",
)

# Build the synthesizer from the annotated metadata and the config above.
synth = HighDimSynthesizer(df_meta, config=config)
# Inspect the models created for the annotated entities. `_df_model` is a
# private attribute, so this is for illustration and may change between releases.
synth._df_model.children

[AddressModel(meta=<Nominal[object]: Address(name=address)>),
 PersonModel(meta=<Nominal[object]: Person(name=person)>),
 PersonModel(meta=<Nominal[object]: Person(name=person_partner)>)]

In [4]:
# Train the synthesizer on the original dataset.
synth.learn(df)

Training [38;5;47m╠████████████████████╣[39m Done.


In [5]:
# Ask the trained synthesizer for 100 rows and keep them for inspection.
df_synth = synth.synthesize(100)
# Bare last expression: render the synthetic frame as a table.
df_synth

Unnamed: 0,gender,title,first_name,last_name,email,name_partner,gender_partner,postcode,city,street,full_address
0,Female,Mrs,Amanda,Davies,amandadavies@hopkins-roberts.com,Josh Rogers,Male,N3B 4XJ,Andersonshire,Barton manor,"3 Barton manor, N3B 4XJ Andersonshire"
1,Female,Mrs,Charlene,Smith,charlene-smith@simpson-mitchell.biz,Abdul Buckley,Non-Binary,WA8 1YH,Robinfurt,Rees islands,"927 Rees islands, WA8 1YH Robinfurt"
2,Male,Mr,Bryan,Thompson,bryanthompson@moss.com,Christine Scott,Female,BH44 2JU,East Sam,Davies mission,"4 Davies mission, BH44 2JU East Sam"
3,Male,Mr,Robert,Sutton,robert_sutton79@evans-baker.com,Robert Jones,Male,B3 6XD,South Marianmouth,Jones coves,"28 Jones coves, B3 6XD South Marianmouth"
4,Female,Ms,Jasmine,North,jasminenorth@jackson.com,Gail Rogers,Non-Binary,B5A 0YD,Shannonmouth,Green islands,"83 Green islands, B5A 0YD Shannonmouth"
...,...,...,...,...,...,...,...,...,...,...,...
95,Female,Mrs,Lorraine,McKenzie,lorraine.mckenzie@brennan-watkins.org,Reece Smith,Male,DN24 6XZ,North Benjamin,Geoffrey groves,"0 Geoffrey groves, DN24 6XZ North Benjamin"
96,Male,Mr,John,Stephens,john_stephens@parry.org,Sean Iqbal,Non-Binary,N3K 0ZR,Talbotton,Christine springs,"20 Christine springs, N3K 0ZR Talbotton"
97,Female,Ms,Michelle,Howard,michelle_howard42@johnson.com,Craig Coates,Male,NR8 3ZF,East Clareport,Gregory prairie,"6 Gregory prairie, NR8 3ZF East Clareport"
98,Female,Ms,Valerie,Clarke,valerie_clarke@ross-benson.co.uk,Mathew Allan,Non-Binary,M65 8GE,North Georgia,Miller walk,"088 Miller walk, M65 8GE North Georgia"


## Generating fuzzed postcodes

In [15]:
from synthesized.config import HighDimConfig

# Configuration for the fuzzed-postcode example; rebinds `config` from the
# previous section.
# NOTE(review): learn_postcodes=True presumably makes the address model learn
# postcodes from the data, and postcode_level presumably selects how much of
# the postcode is preserved -- confirm both against the synthesized docs.
config = HighDimConfig(
    learn_postcodes=True,
    address_locale="en_GB",
    postcode_level=1,
)

In [16]:
# Rebuild the synthesizer with the postcode-learning config; rebinds `synth`.
synth = HighDimSynthesizer(df_meta, config=config)
# NOTE(review): num_iterations=1 presumably keeps training short for this
# demo -- confirm it is enough for the postcode model to be usable.
synth.learn(df, num_iterations=1)

Training Step 50 of 50 [38;5;156m╠████████████████████╣[39m 
Done.


In [17]:
# Ask the postcode-aware synthesizer for 100 rows; rebinds `df_synth`.
df_synth = synth.synthesize(100)
# Bare last expression: render the synthetic frame as a table.
df_synth

Unnamed: 0,gender,title,first_name,last_name,email,name_partner,gender_partner,postcode,city,street,full_address
0,Male,Mr,Luke,Richardson,luke_richardson@wallace.com,Karl Pugh,Male,EN9 9ZD,New Lukeville,Richardson rapids,"83 Richardson rapids, EN9 9ZD New Lukeville"
1,Female,Ms,Heather,Stevens,heather_stevens30@kerr-smith.com,Oliver Gardiner,Non-Binary,PA10 8TL,New Olivia,Kathleen crossing,"0 Kathleen crossing, PA10 8TL New Olivia"
2,Non-Binary,Mx,Stacey,Nash,staceynash97@dobson.org,Caroline Vincent,Non-Binary,IP5 6DX,Johnsonport,Thomas gateway,"955 Thomas gateway, IP5 6DX Johnsonport"
3,Non-Binary,Mx,Marc,Murphy,marc-murphy@chapman.com,Stacey Williams,Non-Binary,CH1 0RN,East Hollyfort,Shirley views,"845 Shirley views, CH1 0RN East Hollyfort"
4,Male,Mr,Benjamin,Atkinson,benjaminatkinson@middleton-wright.com,Ashley Norris,Male,LE3 9RE,Lake Ellie,Marian harbors,"2 Marian harbors, LE3 9RE Lake Ellie"
...,...,...,...,...,...,...,...,...,...,...,...
95,Male,Mr,Marc,Lyons,marclyons@barker.com,Bruce Clarke,Male,DY10 2SA,Hamiltonfort,Lucy lake,"788 Lucy lake, DY10 2SA Hamiltonfort"
96,Male,Mr,Darren,Robinson,darren-robinson11@thompson.com,Elliott Rees,Non-Binary,LA18 8GB,Wallshire,Timothy skyway,"61 Timothy skyway, LA18 8GB Wallshire"
97,Male,Mr,Edward,Miles,edwardmiles50@storey-hussain.co.uk,Hugh Powell,Male,LE3 6DX,Port Jacquelinestad,Robinson rapid,"078 Robinson rapid, LE3 6DX Port Jacquelinestad"
98,Female,Ms,Wendy,Phillips,wendyphillips@coleman.biz,Bryan Hale,Male,ZE1 5JN,Laurenborough,Bennett branch,"885 Bennett branch, ZE1 5JN Laurenborough"


In [23]:
# Preview the postcode/city pairs of the first four original rows.
df[["postcode", "city"]].head(4)

Unnamed: 0,postcode,city
0,AB10 1AB,Aberdeen
1,IM1 1AG,Isle of Man
2,TN34 2EZ,Hastings
3,LA22 9HA,Ambleside


In [24]:
# The address model is the first child of the synthesizer's internal model
# (reached through the private `_df_model` attribute).
address_model = synth._df_model.children[0]

# Condition on the postcode/city pairs of the first four original rows.
postcode_city_conditions = df[["postcode", "city"]][0:4]

df_address = address_model.sample(conditions=postcode_city_conditions)
# The return value is ignored: the call appears to update `df_address` in
# place (NOTE(review): confirm this against the synthesized API).
address_model.meta.convert_df_for_children(df_address)
df_address

Unnamed: 0,postcode,city,street,full_address
0,LA22 8FE,Ambleside,Patel river,"978 Patel river, LA22 8FE Ambleside"
1,AB10 4ZU,Aberdeen,Atkins ridge,"79 Atkins ridge, AB10 4ZU Aberdeen"
2,IM1 6BE,Isle of Man,Brian coves,"45 Brian coves, IM1 6BE Isle of Man"
3,TN34 6SL,Hastings,Francesca forks,"3 Francesca forks, TN34 6SL Hastings"


## Extending to different countries

In [25]:
# Reuse the same annotated metadata with the address locale set to en_US.
# Rebinds `config` and `synth` from the previous section.
config = HighDimConfig(address_locale="en_US")
synth = HighDimSynthesizer(df_meta, config=config)
synth.learn(df)
# Ask for 100 rows; as the last expression, the result is rendered directly.
synth.synthesize(100)

Training [38;5;47m╠████████████████████╣[39m Done.


Unnamed: 0,gender,title,first_name,last_name,email,name_partner,gender_partner,postcode,city,street,full_address
0,Male,Mr,Luke,Willis,luke_willis35@cole.info,Tracey Crawford,Female,36685,Lambside,Tracey Ridge,"63295 Tracey Ridge, 36685 Lambside"
1,Female,Ms,Kimberley,Harding,kimberley_harding85@george.com,Danielle Robinson,Female,11445,South Elizabeth,Woods Square,"5876 Woods Square, 11445 South Elizabeth"
2,Male,Mr,Ian,Evans,ianevans31@lane.co.uk,Leigh Williams,Male,35950,Lake Stephen,Michelle Underpass,"517 Michelle Underpass, 35950 Lake Stephen"
3,Male,Mr,Stephen,Hill,stephen_hill15@boyle.com,Teresa Barnett,Female,17186,Lake Timothy,Michelle Bridge,"03930 Michelle Bridge, 17186 Lake Timothy"
4,Male,Mr,Scott,Dean,scott-dean@parker.info,Lorraine Johnson,Female,95593,Diaztown,Robert Islands,"236 Robert Islands, 95593 Diaztown"
...,...,...,...,...,...,...,...,...,...,...,...
95,Non-Binary,Mx,Amelia,Winter,amelia-winter@gordon-gilbert.com,Mitchell Burke,Male,89957,Port Denisefort,Garcia Streets,"28536 Garcia Streets, 89957 Port Denisefort"
96,Female,Mrs,Kim,Rowley,kim-rowley@jones-jones.co.uk,Gavin Hammond,Male,91960,North Savannahbury,Kari Tunnel,"985 Kari Tunnel, 91960 North Savannahbury"
97,Male,Mr,Joe,Bell,joe-bell@hartley-evans.com,Stanley Marsh,Non-Binary,69347,Crystalland,Aaron Islands,"506 Aaron Islands, 69347 Crystalland"
98,Male,Mr,Ashley,Allen,ashleyallen15@greenwood.com,Lorraine Ward,Female,23153,Pinedaville,Hunter Parks,"557 Hunter Parks, 23153 Pinedaville"


In [28]:
from synthesized.model.models.address import all_supported_locales

# List the locale codes the address model reports as supported.
all_supported_locales()

['az_AZ',
 'bn_BD',
 'cs_CZ',
 'da_DK',
 'de',
 'de_AT',
 'de_CH',
 'de_DE',
 'el_GR',
 'en',
 'en_AU',
 'en_BD',
 'en_CA',
 'en_GB',
 'en_IE',
 'en_IN',
 'en_NZ',
 'en_PH',
 'en_US',
 'es',
 'es_AR',
 'es_CL',
 'es_CO',
 'es_ES',
 'es_MX',
 'fa_IR',
 'fi_FI',
 'fil_PH',
 'fr_CA',
 'fr_CH',
 'fr_FR',
 'he_IL',
 'hi_IN',
 'hr_HR',
 'hu_HU',
 'hy_AM',
 'id_ID',
 'it_IT',
 'ja_JP',
 'ka_GE',
 'ko_KR',
 'ne_NP',
 'nl_BE',
 'nl_NL',
 'no_NO',
 'pl_PL',
 'pt_BR',
 'pt_PT',
 'ro_RO',
 'ru_RU',
 'sk_SK',
 'sl_SI',
 'sv_SE',
 'ta_IN',
 'th',
 'th_TH',
 'tl_PH',
 'uk_UA',
 'zh_CN',
 'zh_TW']

In [29]:
from synthesized.model.models.address import all_supported_postcode_locales

# List the locale codes reported as supported for postcodes.
all_supported_postcode_locales()

['az_AZ',
 'cs_CZ',
 'da_DK',
 'de_AT',
 'de_CH',
 'de_DE',
 'el_GR',
 'en_AU',
 'en_CA',
 'en_GB',
 'en_IE',
 'en_IN',
 'en_NZ',
 'en_PH',
 'en_US',
 'es_AR',
 'es_CL',
 'es_CO',
 'es_ES',
 'es_MX',
 'fi_FI',
 'fil_PH',
 'fr_CA',
 'fr_CH',
 'fr_FR',
 'he_IL',
 'hi_IN',
 'hr_HR',
 'hu_HU',
 'hy_AM',
 'id_ID',
 'it_IT',
 'ja_JP',
 'ka_GE',
 'ko_KR',
 'ne_NP',
 'nl_BE',
 'nl_NL',
 'no_NO',
 'pl_PL',
 'pt_BR',
 'pt_PT',
 'ro_RO',
 'ru_RU',
 'sk_SK',
 'ta_IN',
 'th_TH',
 'tl_PH',
 'uk_UA',
 'zh_CN',
 'zh_TW']