{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Entity Annotation" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
gendertitlefirst_namelast_nameemailname_partnergender_partnerpostcodecitystreetfull_address
0MaleMrImanolKirlinimanol_kirlin@faulkner.comMila WeissnatFemaleAB10 1ABAberdeenBroad StreetBroad Street, AB10 1AB Aberdeen
1FemaleMsClaudieRodriguezclaudierodriguez91@haas.comJorja SchusterFemaleIM1 1AGIsle of ManCircular RoadCircular Road, IM1 1AG Isle of Man
2MaleMrIsmaelZemlakismael-zemlak45@jackson-campbell.infoJalon GloverMaleTN34 2EZHastingsBaldslow RoadBaldslow Road, TN34 2EZ Hastings
3Non-BinaryMxJesusRutherfordjesus-rutherford61@nunez.comMartin KihnMaleLA22 9HAAmblesideKirkfieldKirkfield, LA22 9HA Ambleside
4FemaleMrsLesleeBrownleslee_brown42@mendez.orgDerrell KeeblerMaleW9 2BTLondonShirland RoadShirland Road, W9 2BT London
....................................
6068FemaleMsLouettaO'Connerlouetta_o'conner@gallagher.comObed TerryMaleHG4 2QNRiponBishopton LaneBishopton Lane, HG4 2QN Ripon
6069Non-BinaryMxFleetThompsonfleet_thompson@thompson.comLeeann StoltenbergNon-BinaryEH10 4ANEdinburghFalcon AvenueFalcon Avenue, EH10 4AN Edinburgh
6070MaleMrPleasantKshlerinpleasant.kshlerin69@leonard.orgEvelyne BernierFemaleCM8 1SXWithamHolst AvenueHolst Avenue, CM8 1SX Witham
6071Non-BinaryMxTildenDickenstilden.dickens@alvarez.orgSavion JohnsMaleHA1 2RZHarrowRosslyn CrescentRosslyn Crescent, HA1 2RZ Harrow
6072MaleMrLenaKilbacklena.kilback19@lowe.comRosanne TurnerFemaleLN13 0ABAlfordChristopher RoadChristopher Road, LN13 0AB Alford
\n", "

6073 rows × 11 columns

\n", "
" ], "text/plain": [ " gender title first_name last_name \\\n", "0 Male Mr Imanol Kirlin \n", "1 Female Ms Claudie Rodriguez \n", "2 Male Mr Ismael Zemlak \n", "3 Non-Binary Mx Jesus Rutherford \n", "4 Female Mrs Leslee Brown \n", "... ... ... ... ... \n", "6068 Female Ms Louetta O'Conner \n", "6069 Non-Binary Mx Fleet Thompson \n", "6070 Male Mr Pleasant Kshlerin \n", "6071 Non-Binary Mx Tilden Dickens \n", "6072 Male Mr Lena Kilback \n", "\n", " email name_partner \\\n", "0 imanol_kirlin@faulkner.com Mila Weissnat \n", "1 claudierodriguez91@haas.com Jorja Schuster \n", "2 ismael-zemlak45@jackson-campbell.info Jalon Glover \n", "3 jesus-rutherford61@nunez.com Martin Kihn \n", "4 leslee_brown42@mendez.org Derrell Keebler \n", "... ... ... \n", "6068 louetta_o'conner@gallagher.com Obed Terry \n", "6069 fleet_thompson@thompson.com Leeann Stoltenberg \n", "6070 pleasant.kshlerin69@leonard.org Evelyne Bernier \n", "6071 tilden.dickens@alvarez.org Savion Johns \n", "6072 lena.kilback19@lowe.com Rosanne Turner \n", "\n", " gender_partner postcode city street \\\n", "0 Female AB10 1AB Aberdeen Broad Street \n", "1 Female IM1 1AG Isle of Man Circular Road \n", "2 Male TN34 2EZ Hastings Baldslow Road \n", "3 Male LA22 9HA Ambleside Kirkfield \n", "4 Male W9 2BT London Shirland Road \n", "... ... ... ... ... \n", "6068 Male HG4 2QN Ripon Bishopton Lane \n", "6069 Non-Binary EH10 4AN Edinburgh Falcon Avenue \n", "6070 Female CM8 1SX Witham Holst Avenue \n", "6071 Male HA1 2RZ Harrow Rosslyn Crescent \n", "6072 Female LN13 0AB Alford Christopher Road \n", "\n", " full_address \n", "0 Broad Street, AB10 1AB Aberdeen \n", "1 Circular Road, IM1 1AG Isle of Man \n", "2 Baldslow Road, TN34 2EZ Hastings \n", "3 Kirkfield, LA22 9HA Ambleside \n", "4 Shirland Road, W9 2BT London \n", "... ... 
\n", "6068 Bishopton Lane, HG4 2QN Ripon \n", "6069 Falcon Avenue, EH10 4AN Edinburgh \n", "6070 Holst Avenue, CM8 1SX Witham \n", "6071 Rosslyn Crescent, HA1 2RZ Harrow \n", "6072 Christopher Road, LN13 0AB Alford \n", "\n", "[6073 rows x 11 columns]" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(\"pii_dataset.csv\")\n", "df" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "from synthesized.config import PersonLabels\n",
 "from synthesized.metadata.value import Person\n",
 "\n",
 "# Primary person: group every column that describes them into one annotation.\n",
 "# `gender` is labelled as well; when it was left out, a previous run produced\n",
 "# rows whose gender did not match the title/first name (e.g. Female / Mr / Connor).\n",
 "person_annot = Person(\n",
 "    name=\"person\",\n",
 "    labels=PersonLabels(\n",
 "        gender=\"gender\",\n",
 "        title=\"title\",\n",
 "        firstname=\"first_name\",\n",
 "        lastname=\"last_name\",\n",
 "        email=\"email\",\n",
 "    ),\n",
 ")\n",
 "\n",
 "# Partner: the dataset only holds a full name and a gender column for them.\n",
 "person_partner_annot = Person(\n",
 "    name=\"person_partner\",\n",
 "    labels=PersonLabels(fullname=\"name_partner\", gender=\"gender_partner\"),\n",
 ")"
] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
 "from synthesized.config import AddressLabels\n",
 "from synthesized.metadata.value import Address\n",
 "\n",
 "# Group the four address columns of the dataset into a single annotation.\n",
 "address_annot = Address(\n",
 "    name=\"address\",\n",
 "    labels=AddressLabels(\n",
 "        postcode=\"postcode\",\n",
 "        street=\"street\",\n",
 "        city=\"city\",\n",
 "        full_address=\"full_address\",\n",
 "    ),\n",
 "    locales=\"en_GB\",\n",
 ")"
] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
 "from synthesized import MetaExtractor\n",
 "\n",
 "# Extract the dataset metadata, handing over all three annotations defined above.\n",
 "df_meta = MetaExtractor.extract(\n",
 "    df=df,\n",
 "    annotations=[person_annot, person_partner_annot, address_annot],\n",
 ")"
] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "from synthesized.config import HighDimConfig\n",
 "from synthesized import HighDimSynthesizer\n",
 "\n",
 "config = HighDimConfig(\n",
 "    learn_postcodes=True,\n",
 "    postcode_level=1,\n",
 ")\n",
 "\n",
 "synth = HighDimSynthesizer(df_meta, config=config)"
] },
{
"cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[38;5;98mYou're currently on the free version of the Synthesized SDK. For full access to all of our features contact the Synthesized team.\u001b[39m\n", "Training \u001b[38;5;47m╠████████████████████╣\u001b[39m Done.\n" ] } ], "source": [ "synth.learn(df)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[38;5;98mYou're currently on the free version of the Synthesized SDK. For full access to all of our features contact the Synthesized team.\u001b[39m\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
gendertitlefirst_namelast_nameemailname_partnergender_partnerpostcodecitystreetfull_address
0FemaleMrConnorSmartconnorsmart42@watson-morgan.comEdward PatelMaleCA35XWCarlisleThomas bridgeFlat 22 97 Thomas bridge, CA35XW Carlisle West...
1FemaleMsSaraArchersara.archer35@bennett.comAdam GrantMaleSK100GUMacclesfieldJones crossingFlat 01 8 Jones crossing, SK100GU Macclesfield...
2FemaleMrsAliceFosteralice.foster@ferguson.comJoanne HarrisFemaleBH236ZRChristchurchMalcolm lightFlat 13F 35 Malcolm light, BH236ZR Christchurc...
3Non-BinaryMrRogerMiahrogermiah68@perkins-henderson.netLorraine ConnollyFemaleSN156XBChippenhamStephanie divideStudio 92Q 839 Stephanie divide, SN156XB Chipp...
4MaleMrThomasWhitethomas-white@carter.comPaige PriceFemaleS425UZChesterfieldThompson streetsFlat 21 10 Thompson streets, S425UZ Chesterfie...
5MaleMrDerekRicederek-rice@hart-mason.comBeth BoyleFemaleCB239JXCambridgeAlexander pineFlat 5 180 Alexander pine, CB239JX Cambridge E...
6FemaleMrsJosephineMilesjosephine_miles@shaw.comToby NolanMaleWF136JWDewsburyStanley forksFlat 92 132 Stanley forks, WF136JW Dewsbury Ci...
7Non-BinaryMrFrederickPottsfrederick-potts75@perry-brennan.comJane WilliamsFemaleTR135SGHelstonPamela courtsStudio 9 1 Pamela courts, TR135SG Helston Dorset
8Non-BinaryMrCraigMitchellcraig-mitchell@warner-jordan.comAaron BegumNon-BinaryLE44SJLeicesterCraig glensFlat 5 9 Craig glens, LE44SJ Leicester Angus
9FemaleMrHughWalshhugh.walsh48@clarke.comGraeme NichollsMaleDD100RQMontroseJones trafficwayFlat 95 721 Jones trafficway, DD100RQ Montrose...
\n", "
" ], "text/plain": [ " gender title first_name last_name email \\\n", "0 Female Mr Connor Smart connorsmart42@watson-morgan.com \n", "1 Female Ms Sara Archer sara.archer35@bennett.com \n", "2 Female Mrs Alice Foster alice.foster@ferguson.com \n", "3 Non-Binary Mr Roger Miah rogermiah68@perkins-henderson.net \n", "4 Male Mr Thomas White thomas-white@carter.com \n", "5 Male Mr Derek Rice derek-rice@hart-mason.com \n", "6 Female Mrs Josephine Miles josephine_miles@shaw.com \n", "7 Non-Binary Mr Frederick Potts frederick-potts75@perry-brennan.com \n", "8 Non-Binary Mr Craig Mitchell craig-mitchell@warner-jordan.com \n", "9 Female Mr Hugh Walsh hugh.walsh48@clarke.com \n", "\n", " name_partner gender_partner postcode city street \\\n", "0 Edward Patel Male CA35XW Carlisle Thomas bridge \n", "1 Adam Grant Male SK100GU Macclesfield Jones crossing \n", "2 Joanne Harris Female BH236ZR Christchurch Malcolm light \n", "3 Lorraine Connolly Female SN156XB Chippenham Stephanie divide \n", "4 Paige Price Female S425UZ Chesterfield Thompson streets \n", "5 Beth Boyle Female CB239JX Cambridge Alexander pine \n", "6 Toby Nolan Male WF136JW Dewsbury Stanley forks \n", "7 Jane Williams Female TR135SG Helston Pamela courts \n", "8 Aaron Begum Non-Binary LE44SJ Leicester Craig glens \n", "9 Graeme Nicholls Male DD100RQ Montrose Jones trafficway \n", "\n", " full_address \n", "0 Flat 22 97 Thomas bridge, CA35XW Carlisle West... \n", "1 Flat 01 8 Jones crossing, SK100GU Macclesfield... \n", "2 Flat 13F 35 Malcolm light, BH236ZR Christchurc... \n", "3 Studio 92Q 839 Stephanie divide, SN156XB Chipp... \n", "4 Flat 21 10 Thompson streets, S425UZ Chesterfie... \n", "5 Flat 5 180 Alexander pine, CB239JX Cambridge E... \n", "6 Flat 92 132 Stanley forks, WF136JW Dewsbury Ci... \n", "7 Studio 9 1 Pamela courts, TR135SG Helston Dorset \n", "8 Flat 5 9 Craig glens, LE44SJ Leicester Angus \n", "9 Flat 95 721 Jones trafficway, DD100RQ Montrose... 
" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_synth = synth.synthesize(10)\n", "df_synth" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" } }, "nbformat": 4, "nbformat_minor": 4 }