From 9cd21fa12423f2174c37ab5d973da8ecb63a78d5 Mon Sep 17 00:00:00 2001 From: artkulak Date: Sat, 5 Feb 2022 09:34:10 +0600 Subject: [PATCH] Old version --- examples/pandas.ipynb | 1745 ----------------------------------------- 1 file changed, 1745 deletions(-) delete mode 100644 examples/pandas.ipynb diff --git a/examples/pandas.ipynb b/examples/pandas.ipynb deleted file mode 100644 index 693c1ba..0000000 --- a/examples/pandas.ipynb +++ /dev/null @@ -1,1745 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "examples", - "provenance": [], - "collapsed_sections": [ - "6YdsVXnwHgfc" - ] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# Installation" - ], - "metadata": { - "id": "6YdsVXnwHgfc" - } - }, - { - "cell_type": "code", - "source": [ - "!pip install cape-privacy==0.3.0 --no-deps" - ], - "metadata": { - "id": "5hfF6zgl8MUL" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "!pip install anonympy==0.2.0" - ], - "metadata": { - "id": "jo7icbdeoVFe" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#Importing and Initializing" - ], - "metadata": { - "id": "l8hVgsTQsFlL" - } - }, - { - "cell_type": "code", - "source": [ - "from anonympy.pandas import dfAnonymizer\n", - "from anonympy.pandas.utils import load_dataset" - ], - "metadata": { - "id": "GcGQR3FW43iP" - }, - "execution_count": 43, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "# specify datettime columns\n", - "df = load_dataset('big')\n", - "\n", - "anonym = dfAnonymizer(df)\n", - "\n", - "anonym" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "C6MnzoxsBiSr", - "outputId": "02f9b5c3-1344-45a1-9acc-3ec6bb6b010a" - }, - "execution_count": 78, - "outputs": [ - { - "output_type": 
"execute_result", - "data": { - "text/plain": [ - "+-------------------------------+\n", - "| Total number of columns: 10 |\n", - "+===============================+\n", - "| Anonymized Column -> Method: |\n", - "+-------------------------------+\n", - "| Unanonymized Columns: |\n", - "| - first_name |\n", - "| - address |\n", - "| - city |\n", - "| - postal |\n", - "| - phone |\n", - "| - email |\n", - "| - web |\n", - "| - salary |\n", - "| - birthdate |\n", - "| - age |\n", - "+-------------------------------+" - ] - }, - "metadata": {}, - "execution_count": 78 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "

So, we have 10 columns. None of them has been anonymized yet.

" - ], - "metadata": { - "id": "9VuuD4VuCF9U" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Check dtypes" - ], - "metadata": { - "id": "DdSuJuuLHoYl" - } - }, - { - "cell_type": "markdown", - "source": [ - "

Since anonymization methods depend on data type, it's recommended to check each column's data type before applying any functions.

" - ], - "metadata": { - "id": "DquwQyMXC8kC" - } - }, - { - "cell_type": "code", - "source": [ - "print('Rows:', df.shape[0],'\\nColumns:', df.shape[1])\n", - "df.head()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 374 - }, - "id": "iy1MFIEzDVmH", - "outputId": "c53f60a6-aaec-423d-affd-e12a2241792b" - }, - "execution_count": 45, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Rows: 500 \n", - "Columns: 10\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
first_nameaddresscitypostalphoneemailwebsalarybirthdateage
0Aleshia14 Taylor StSt. Stephens WardCT2 7PP01835-703597atomkiewicz@hotmail.comhttp://www.alandrosenburgcpapc.co.uk463912000-12-23 15:09:18.11747520021
1Evan5 Binney StAbbey WardHP11 2AX01937-864715evan.zigomalas@gmail.comhttp://www.capgeminiamerica.co.uk307982004-04-22 04:09:51.32594880017
2France8 Moor PlaceEast Southbourne and Tuckton WBH6 3BE01347-368222france.andrade@hotmail.comhttp://www.elliottjohnwesq.co.uk323842002-01-21 18:56:29.09002560019
3Ulysses505 Exeter RdHawerby cum BeesbyDN36 5RP01912-771311ulysses@hotmail.comhttp://www.mcmahanbenl.co.uk392982000-11-24 21:59:48.62184000021
4Tyisha5396 Forth StreetGreets Green and Lyng WardB70 9DT01547-429341tyisha.veness@hotmail.comhttp://www.champagneroom.co.uk416301998-06-23 05:19:37.68700800023
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ], - "text/plain": [ - " first_name address ... birthdate age\n", - "0 Aleshia 14 Taylor St ... 2000-12-23 15:09:18.117475200 21\n", - "1 Evan 5 Binney St ... 2004-04-22 04:09:51.325948800 17\n", - "2 France 8 Moor Place ... 2002-01-21 18:56:29.090025600 19\n", - "3 Ulysses 505 Exeter Rd ... 2000-11-24 21:59:48.621840000 21\n", - "4 Tyisha 5396 Forth Street ... 1998-06-23 05:19:37.687008000 23\n", - "\n", - "[5 rows x 10 columns]" - ] - }, - "metadata": {}, - "execution_count": 45 - } - ] - }, - { - "cell_type": "code", - "source": [ - "print('Categorical Columns: ', anonym.categorical_columns)\n", - "print('Numeric Columns: ', anonym.numeric_columns)\n", - "print('Datetime Columns: ', anonym.datetime_columns)\n", - "\n", - "print('\\nOr Call `info` method\\n ')\n", - "\n", - "anonym.info()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "IheXU6FtC19U", - "outputId": "0dbd3eef-7621-48d1-b42c-e0227f82f6fb" - }, - "execution_count": 46, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Categorical Columns: ['first_name', 'address', 'city', 'postal', 'phone', 'email', 'web']\n", - "Numeric Columns: ['salary', 'age']\n", - "Datetime Columns: ['birthdate']\n", - "\n", - "Or Call `info` method\n", - " \n", - "+------------+--------+-------------+--------+\n", - "| Column | Status | Type | Method |\n", - "+============+========+=============+========+\n", - "| first_name | 0 | categorical | |\n", - "+------------+--------+-------------+--------+\n", - "| address | 0 | categorical | |\n", - "+------------+--------+-------------+--------+\n", - "| city | 0 | categorical | |\n", - "+------------+--------+-------------+--------+\n", - "| postal | 0 | categorical | |\n", - "+------------+--------+-------------+--------+\n", - "| phone | 0 | categorical | |\n", - "+------------+--------+-------------+--------+\n", - "| email | 0 | categorical | |\n", - 
"+------------+--------+-------------+--------+\n", - "| web | 0 | categorical | |\n", - "+------------+--------+-------------+--------+\n", - "| salary | 0 | numeric | |\n", - "+------------+--------+-------------+--------+\n", - "| birthdate | 0 | datetime | |\n", - "+------------+--------+-------------+--------+\n", - "| age | 0 | numeric | |\n", - "+------------+--------+-------------+--------+\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "

7 categorical, 2 numerical and 1 datetime columns. \n", - "Let's see what methods we can apply to categorical columns.\n", - "
List of available methods: anonympy.pandas.utils.available_methods\n", - "

" - ], - "metadata": { - "id": "9jBf13NXD8RS" - } - }, - { - "cell_type": "code", - "source": [ - "from anonympy.pandas.utils import available_methods" - ], - "metadata": { - "id": "e1Yey5BGDuLG" - }, - "execution_count": 47, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "available_methods('categorical') # args: 'categorical' / 'numerical' / 'datetime' / 'general' / None" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "t2gqFZdoEU9K", - "outputId": "843b2a43-2d99-438b-8023-74e012e09dfd" - }, - "execution_count": 48, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "categorical_fake\tcategorical_fake_auto\tcategorical_resampling\tcategorical_tokenization\tcategorical_email_masking\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Categorical Email Masking\n", - "\n", - "

To apply partial email masking, call categorical_email_masking on the corresponding column.

" - ], - "metadata": { - "id": "vIr7XFBoSoDh" - } - }, - { - "cell_type": "code", - "source": [ - "anonym.categorical_email_masking(columns='email', inplace = False)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Lxbx7cmbSquq", - "outputId": "32075f7e-76d3-45e0-9865-124ba8bd147a" - }, - "execution_count": 49, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "0 a*****z@hotmail.com\n", - "1 e*****s@gmail.com\n", - "2 f*****e@hotmail.com\n", - "3 u*****s@hotmail.com\n", - "4 t*****s@hotmail.com\n", - " ... \n", - "495 a*****y@veit.co.uk\n", - "496 r*****i@euresti.co.uk\n", - "497 c*****g@brenning.co.uk\n", - "498 c*****y@gmail.com\n", - "499 m*****i@hotmail.com\n", - "Name: email, Length: 500, dtype: object" - ] - }, - "metadata": {}, - "execution_count": 49 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "`inplace=True` (default) - changes will be applied to the dataframe. Access using `anonym.to_df()`\n", - "
\n", - "`inplace = False` - return the changes" - ], - "metadata": { - "id": "ma_Bf0IATgyc" - } - }, - { - "cell_type": "code", - "source": [ - "anonym.categorical_email_masking('email') # inplace = True\n", - "\n", - "print(anonym.anonymized_columns, '\\n')\n", - "\n", - "print(anonym)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "u--sEHaXTfMs", - "outputId": "ed34ea67-c8a5-42ac-ec38-3944eb191851" - }, - "execution_count": 50, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "['email'] \n", - "\n", - "+-------------------------------+\n", - "| Total number of columns: 10 |\n", - "+===============================+\n", - "| Anonymized Column -> Method: |\n", - "| - email -> Partial Masking |\n", - "+-------------------------------+\n", - "| Unanonymized Columns: |\n", - "| - first_name |\n", - "| - address |\n", - "| - city |\n", - "| - postal |\n", - "| - phone |\n", - "| - web |\n", - "| - salary |\n", - "| - birthdate |\n", - "| - age |\n", - "+-------------------------------+\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "Now **email** column appears to be anonymized" - ], - "metadata": { - "id": "wYK1onhthhc-" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Categorical Synthetic data (fake data) \n", - "

Next, let's replace some values with synthetically generated ones. \n", - "
If a column name is the same as the corresponding faker method, use the categorical_fake_auto method.\n", - "
List of faker's methods: anonympy.pandas.utils.fake_methods\n", - "

" - ], - "metadata": { - "id": "JCgS0Zr0F_ni" - } - }, - { - "cell_type": "code", - "source": [ - "from anonympy.pandas.utils import fake_methods" - ], - "metadata": { - "id": "5nn0FVkWEyd0" - }, - "execution_count": 51, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "fake_methods() # args: letter / None/ 'all'\n", - "# output in the following screenshot" - ], - "metadata": { - "id": "0t36isKIHgd-", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "outputId": "bd13d825-3d93-4ec6-890c-72fd239fb9b9" - }, - "execution_count": 52, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "A | aba, address, administrative_unit, am_pm, android_platform_token, ascii_company_email, ascii_email, ascii_free_email, ascii_safe_email\n", - "B | bank_country, bban, boolean, bothify, bs, building_number\n", - "C | cache_pattern, catch_phrase, century, chrome, city, city_prefix, city_suffix, color, color_name, company, company_email, company_suffix, coordinate, country, country_calling_code, country_code, credit_card_expire, credit_card_full, credit_card_number, credit_card_provider, credit_card_security_code, cryptocurrency, cryptocurrency_code, cryptocurrency_name, csv, currency, currency_code, currency_name, currency_symbol, current_country, current_country_code\n", - "D | date, date_between, date_between_dates, date_object, date_of_birth, date_this_century, date_this_decade, date_this_month, date_this_year, date_time, date_time_ad, date_time_between, date_time_between_dates, date_time_this_century, date_time_this_decade, date_time_this_month, date_time_this_year, day_of_month, day_of_week, del_arguments, dga, domain_name, domain_word, dsv\n", - "E | ean, ean13, ean8, ein, email\n", - "F | factories, file_extension, file_name, file_path, firefox, first_name, first_name_female, first_name_male, first_name_nonbinary, fixed_width, format, free_email, free_email_domain, future_date, future_datetime\n", - "G | generator_attrs, 
get_arguments, get_formatter, get_providers\n", - "H | hex_color, hexify, hostname, http_method\n", - "I | iana_id, iban, image, image_url, internet_explorer, invalid_ssn, ios_platform_token, ipv4, ipv4_network_class, ipv4_private, ipv4_public, ipv6, isbn10, isbn13, iso8601, items, itin\n", - "J | job, json\n", - "L | language_code, language_name, last_name, last_name_female, last_name_male, last_name_nonbinary, latitude, latlng, lexify, license_plate, linux_platform_token, linux_processor, local_latlng, locale, locales, localized_ean, localized_ean13, localized_ean8, location_on_land, longitude\n", - "M | mac_address, mac_platform_token, mac_processor, md5, military_apo, military_dpo, military_ship, military_state, mime_type, month, month_name, msisdn\n", - "N | name, name_female, name_male, name_nonbinary, nic_handle, nic_handles, null_boolean, numerify\n", - "O | opera\n", - "P | paragraph, paragraphs, parse, password, past_date, past_datetime, phone_number, port_number, postalcode, postalcode_in_state, postalcode_plus4, postcode, postcode_in_state, prefix, prefix_female, prefix_male, prefix_nonbinary, pricetag, profile, provider, providers, psv, pybool, pydecimal, pydict, pyfloat, pyint, pyiterable, pylist, pyset, pystr, pystr_format, pystruct, pytimezone, pytuple\n", - "R | random, random_choices, random_digit, random_digit_not_null, random_digit_not_null_or_empty, random_digit_or_empty, random_element, random_elements, random_int, random_letter, random_letters, random_lowercase_letter, random_number, random_sample, random_uppercase_letter, randomize_nb_elements, rgb_color, rgb_css_color, ripe_id\n", - "S | safari, safe_color_name, safe_domain_name, safe_email, safe_hex_color, secondary_address, seed_instance, seed_locale, sentence, sentences, set_arguments, set_formatter, sha1, sha256, simple_profile, slug, ssn, state, state_abbr, street_address, street_name, street_suffix, suffix, suffix_female, suffix_male, suffix_nonbinary, swift, swift11, swift8\n", - "T 
| tar, text, texts, time, time_delta, time_object, time_series, timezone, tld, tsv\n", - "U | unique, unix_device, unix_partition, unix_time, upc_a, upc_e, uri, uri_extension, uri_page, uri_path, url, user_agent, user_name, uuid4\n", - "W | weights, windows_platform_token, word, words\n", - "Y | year\n", - "Z | zipcode, zipcode_in_state, zipcode_plus4\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "As we can see from this list, column names: **address**, **city** and **first_name** correspond to faker's methods. So we can run `categorical_fake_auto` and it will anonymize these 3 columns\n", - "* `email` method is also there, but since we already anonymized it, it won't get affected.\n", - "\n", - "![image.png]()" - ], - "metadata": { - "id": "Efy7zl9xJjGh" - } - }, - { - "cell_type": "code", - "source": [ - "anonym.categorical_fake_auto() # default args: locale='en_US', inplace=True\n", - "\n", - "# we could use `categorical_fake` method as well, with specifiying the columns\n", - "# anonym.categorical_fake(['first_name', 'address', 'city'])\n", - "\n", - "print(anonym)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1h4uErHyJoCZ", - "outputId": "c83dd408-c618-478f-8a6b-2193df46afba" - }, - "execution_count": 53, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "`email` column already anonymized!\n", - "+--------------------------------+\n", - "| Total number of columns: 10 |\n", - "+================================+\n", - "| Anonymized Column -> Method: |\n", - "| - email -> Partial Masking |\n", - "| - first_name -> Synthetic Data |\n", - "| - address -> Synthetic Data |\n", - "| - city -> Synthetic Data |\n", - "+--------------------------------+\n", - "| Unanonymized Columns: |\n", - "| - postal |\n", - "| - phone |\n", - "| - web |\n", - "| - salary |\n", - "| - birthdate |\n", - "| - age |\n", - "+--------------------------------+\n" - ] - } - ] - }, - { - "cell_type": 
"markdown", - "source": [ - "Let's call `fake_methods` for letter 'u'" - ], - "metadata": { - "id": "ezXj21Kwj5xk" - } - }, - { - "cell_type": "code", - "source": [ - "fake_methods('u')" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "g-fVpOSVj5Or", - "outputId": "fac57759-8a09-4b5b-bf48-9e2063f9f70c" - }, - "execution_count": 54, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " unique, unix_device, unix_partition, unix_time, upc_a, upc_e, uri, uri_extension, uri_page, uri_path, url, user_agent, user_name, uuid4\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "In our dataframe '**web**' column column stored different **urls**. So we can use `url` method on '**web**' column. \n", - "\n", - "And column '**phone**' values can be replaced using `phone_number` method\n", - "\n", - "Let's apply these changes using `categorical_fake` method which accepts list of column names (if similar to method name) or a dictionary (when names differ)" - ], - "metadata": { - "id": "1GSUKASskUMN" - } - }, - { - "cell_type": "code", - "source": [ - "anonym.categorical_fake({'web':'url', 'phone': 'phone_number'}) # inplace = True\n", - "\n", - "anonym.info()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "9xOiIlg0koNv", - "outputId": "d52a595e-d196-47e6-b282-f8113139b1b6" - }, - "execution_count": 55, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+------------+--------+-------------+-----------------+\n", - "| Column | Status | Type | Method |\n", - "+============+========+=============+=================+\n", - "| first_name | 1 | categorical | Synthetic Data |\n", - "+------------+--------+-------------+-----------------+\n", - "| address | 1 | categorical | Synthetic Data |\n", - "+------------+--------+-------------+-----------------+\n", - "| city | 1 | categorical | Synthetic Data |\n", - 
"+------------+--------+-------------+-----------------+\n", - "| postal | 0 | categorical | |\n", - "+------------+--------+-------------+-----------------+\n", - "| phone | 1 | categorical | Synthetic Data |\n", - "+------------+--------+-------------+-----------------+\n", - "| email | 1 | categorical | Partial Masking |\n", - "+------------+--------+-------------+-----------------+\n", - "| web | 1 | categorical | Synthetic Data |\n", - "+------------+--------+-------------+-----------------+\n", - "| salary | 0 | numeric | |\n", - "+------------+--------+-------------+-----------------+\n", - "| birthdate | 0 | datetime | |\n", - "+------------+--------+-------------+-----------------+\n", - "| age | 0 | numeric | |\n", - "+------------+--------+-------------+-----------------+\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Categorical Tokenization\n" - ], - "metadata": { - "id": "NKODUxOmn1fk" - } - }, - { - "cell_type": "markdown", - "source": [ - "Let's apply tokenization on '**postal**' column" - ], - "metadata": { - "id": "md6pqB-boElC" - } - }, - { - "cell_type": "code", - "source": [ - "print(anonym.categorical_tokenization('postal', inplace = False)) # first inplace = False, to see how the changes will look\n", - "\n", - "anonym.categorical_tokenization('postal') # inplace = True, by default" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "VhHUGjftoA-U", - "outputId": "eaf39bdd-562a-41a8-aaa7-d5398e3b7e7b" - }, - "execution_count": 56, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "0 2a8ba32381\n", - "1 1569763104\n", - "2 73c0b907be\n", - "3 b155a02362\n", - "4 56a6b1e03a\n", - " ... \n", - "495 7687741a49\n", - "496 6a16bbf755\n", - "497 bedd62bb14\n", - "498 30dae3be6e\n", - "499 7c49b8248b\n", - "Name: postal, Length: 500, dtype: object\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# which columns are left unanonymized? 
\n", - "print(anonym.unanonymized_columns)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "rO1SI4H2pXyr", - "outputId": "0eeb5fe6-d51e-4b22-ae6a-e8a531c50097" - }, - "execution_count": 58, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "['salary', 'birthdate', 'age']\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Datetime Perturbation (noise)" - ], - "metadata": { - "id": "x301iHA7peji" - } - }, - { - "cell_type": "markdown", - "source": [ - "Let's add some random noise to '**birthdate**' column\n", - "
Also, we should specify to add noise only to days and month but not year." - ], - "metadata": { - "id": "NWLhBATkpk20" - } - }, - { - "cell_type": "code", - "source": [ - "print(anonym.datetime_noise('birthdate', frequency=('MONTH', 'DAY'), inplace = False), '\\n') # inplace = False to observe the changes\n", - "\n", - "print(df.birthdate) # for comparison" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mcS3tL6kpgd3", - "outputId": "2be407a8-2f95-4d57-feb0-85ee8f3702ce" - }, - "execution_count": 65, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "0 2000-03-30 15:09:18.117475200\n", - "1 2004-05-18 04:09:51.325948800\n", - "2 2002-07-15 18:56:29.090025600\n", - "3 2000-11-19 21:59:48.621840000\n", - "4 1998-05-20 05:19:37.687008000\n", - " ... \n", - "495 1994-09-15 16:40:58.379318400\n", - "496 1998-11-08 11:23:56.188204800\n", - "497 1998-06-12 22:03:29.331331200\n", - "498 1995-02-15 21:48:38.237414400\n", - "499 2000-10-12 20:38:06.739699200\n", - "Name: birthdate, Length: 500, dtype: datetime64[ns] \n", - "\n", - "0 2000-12-23 15:09:18.117475200\n", - "1 2004-04-22 04:09:51.325948800\n", - "2 2002-01-21 18:56:29.090025600\n", - "3 2000-11-24 21:59:48.621840000\n", - "4 1998-06-23 05:19:37.687008000\n", - " ... 
\n", - "495 1995-06-08 16:40:58.379318400\n", - "496 1999-02-10 11:23:56.188204800\n", - "497 1998-01-13 22:03:29.331331200\n", - "498 1994-12-20 21:48:38.237414400\n", - "499 2000-02-13 20:38:06.739699200\n", - "Name: birthdate, Length: 500, dtype: datetime64[ns]\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "anonym.datetime_noise('birthdate', frequency=('MONTH', 'DAY')) # inplace=True, to apply the changes" - ], - "metadata": { - "id": "rKmHk_aVqv3f" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "anonym.info()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "mWsbXFSCq1FY", - "outputId": "63581894-6f72-4517-f1b0-29398f892db7" - }, - "execution_count": 67, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+------------+--------+-------------+-----------------------+\n", - "| Column | Status | Type | Method |\n", - "+============+========+=============+=======================+\n", - "| first_name | 1 | categorical | Synthetic Data |\n", - "+------------+--------+-------------+-----------------------+\n", - "| address | 1 | categorical | Synthetic Data |\n", - "+------------+--------+-------------+-----------------------+\n", - "| city | 1 | categorical | Synthetic Data |\n", - "+------------+--------+-------------+-----------------------+\n", - "| postal | 1 | categorical | Tokenization |\n", - "+------------+--------+-------------+-----------------------+\n", - "| phone | 1 | categorical | Synthetic Data |\n", - "+------------+--------+-------------+-----------------------+\n", - "| email | 1 | categorical | Partial Masking |\n", - "+------------+--------+-------------+-----------------------+\n", - "| web | 1 | categorical | Synthetic Data |\n", - "+------------+--------+-------------+-----------------------+\n", - "| salary | 0 | numeric | |\n", - "+------------+--------+-------------+-----------------------+\n", - "| birthdate 
| 1 | datetime | Datetime Perturbation |\n", - "+------------+--------+-------------+-----------------------+\n", - "| age | 0 | numeric | |\n", - "+------------+--------+-------------+-----------------------+\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "Two numeric columns: '**salary**' and '**age**' are left" - ], - "metadata": { - "id": "5E4QdD_oq3hK" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Numeric Rounding " - ], - "metadata": { - "id": "m6YhI21CrDVE" - } - }, - { - "cell_type": "markdown", - "source": [ - "Applying `numeric_rounding` to '**salary**' column" - ], - "metadata": { - "id": "EHkjZIYOsNvt" - } - }, - { - "cell_type": "code", - "source": [ - "print(df.salary, '\\n') # original\n", - "anonym.numeric_rounding('salary', inplace = False) # see the changes" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "6rqrKURWrGAg", - "outputId": "b9b7a53f-73a3-4f13-99ba-6e07dd20242e" - }, - "execution_count": 68, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "0 46391\n", - "1 30798\n", - "2 32384\n", - "3 39298\n", - "4 41630\n", - " ... \n", - "495 42239\n", - "496 42640\n", - "497 44982\n", - "498 32827\n", - "499 41266\n", - "Name: salary, Length: 500, dtype: int64 \n", - "\n" - ] - }, - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "0 50000\n", - "1 30000\n", - "2 30000\n", - "3 40000\n", - "4 40000\n", - " ... 
\n", - "495 40000\n", - "496 40000\n", - "497 40000\n", - "498 30000\n", - "499 40000\n", - "Name: salary, Length: 500, dtype: int64" - ] - }, - "metadata": {}, - "execution_count": 68 - } - ] - }, - { - "cell_type": "code", - "source": [ - "anonym.numeric_rounding('salary') # apply the changes" - ], - "metadata": { - "id": "XE1rXt4-r-I1" - }, - "execution_count": 69, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "# Numeric Perturbation (noise)" - ], - "metadata": { - "id": "jx4zer-nsGBr" - } - }, - { - "cell_type": "markdown", - "source": [ - " Some noise to '**age**' column would be nice" - ], - "metadata": { - "id": "P3SwYoXcs_a1" - } - }, - { - "cell_type": "code", - "source": [ - "print(df.age, '\\n') # original\n", - "\n", - "print(anonym.numeric_noise('age', inplace = False)) # observe the changes" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "PU5fy0wXsJx6", - "outputId": "acd07350-4d94-4555-aaa5-f8c9bd21b21a" - }, - "execution_count": 71, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "0 21\n", - "1 17\n", - "2 19\n", - "3 21\n", - "4 23\n", - " ..\n", - "495 26\n", - "496 22\n", - "497 23\n", - "498 27\n", - "499 21\n", - "Name: age, Length: 500, dtype: int64 \n", - "\n", - "0 28\n", - "1 17\n", - "2 25\n", - "3 26\n", - "4 17\n", - " ..\n", - "495 17\n", - "496 19\n", - "497 25\n", - "498 21\n", - "499 28\n", - "Length: 500, dtype: int64\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "anonym.numeric_noise('age') # apply the changes" - ], - "metadata": { - "id": "moJtlWantd4d" - }, - "execution_count": 72, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "anonym" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "g8Whzhwmtui2", - "outputId": "17bda6eb-0655-4be0-d8e7-4a369e3ddbd5" - }, - "execution_count": 73, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - 
"+---------------------------------------+\n", - "| Total number of columns: 10 |\n", - "+=======================================+\n", - "| Anonymized Column -> Method: |\n", - "| - email -> Partial Masking |\n", - "| - first_name -> Synthetic Data |\n", - "| - address -> Synthetic Data |\n", - "| - city -> Synthetic Data |\n", - "| - web -> Synthetic Data |\n", - "| - phone -> Synthetic Data |\n", - "| - postal -> Tokenization |\n", - "| - birthdate -> Datetime Perturbation |\n", - "| - salary -> Generalization - Rounding |\n", - "| - age -> Numeric Perturbation |\n", - "+---------------------------------------+\n", - "| Unanonymized Columns: |\n", - "| |\n", - "+---------------------------------------+" - ] - }, - "metadata": {}, - "execution_count": 73 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "All columns have been successfully anonymized, let's now compare both datasets before and after anonymization " - ], - "metadata": { - "id": "Fx5pcH7YtxSU" - } - }, - { - "cell_type": "code", - "source": [ - "df.head()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 337 - }, - "id": "aUhWIBF9twM1", - "outputId": "1c1b74bc-753d-4347-c871-c1d492f2942d" - }, - "execution_count": 76, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
first_nameaddresscitypostalphoneemailwebsalarybirthdateage
0Aleshia14 Taylor StSt. Stephens WardCT2 7PP01835-703597atomkiewicz@hotmail.comhttp://www.alandrosenburgcpapc.co.uk463912000-12-23 15:09:18.11747520021
1Evan5 Binney StAbbey WardHP11 2AX01937-864715evan.zigomalas@gmail.comhttp://www.capgeminiamerica.co.uk307982004-04-22 04:09:51.32594880017
2France8 Moor PlaceEast Southbourne and Tuckton WBH6 3BE01347-368222france.andrade@hotmail.comhttp://www.elliottjohnwesq.co.uk323842002-01-21 18:56:29.09002560019
3Ulysses505 Exeter RdHawerby cum BeesbyDN36 5RP01912-771311ulysses@hotmail.comhttp://www.mcmahanbenl.co.uk392982000-11-24 21:59:48.62184000021
4Tyisha5396 Forth StreetGreets Green and Lyng WardB70 9DT01547-429341tyisha.veness@hotmail.comhttp://www.champagneroom.co.uk416301998-06-23 05:19:37.68700800023
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ], - "text/plain": [ - " first_name address ... birthdate age\n", - "0 Aleshia 14 Taylor St ... 2000-12-23 15:09:18.117475200 21\n", - "1 Evan 5 Binney St ... 2004-04-22 04:09:51.325948800 17\n", - "2 France 8 Moor Place ... 2002-01-21 18:56:29.090025600 19\n", - "3 Ulysses 505 Exeter Rd ... 2000-11-24 21:59:48.621840000 21\n", - "4 Tyisha 5396 Forth Street ... 1998-06-23 05:19:37.687008000 23\n", - "\n", - "[5 rows x 10 columns]" - ] - }, - "metadata": {}, - "execution_count": 76 - } - ] - }, - { - "cell_type": "code", - "source": [ - "anonym.to_df()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 867 - }, - "id": "A_qehCevuDJG", - "outputId": "cdb70418-89db-4bdc-c4ef-87c8326b406f" - }, - "execution_count": 77, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
first_nameaddresscitypostalphoneemailwebsalarybirthdateage
0Betty037 Anderson Branch Apt. 184\\nPowellfurt, AL 5...Annsideac9296b7cd+1-669-652-5362x575a*****z@hotmail.comhttp://wilson.com/500002001-02-21 15:09:18.11747520023
1Dustin32977 Amy Shores\\nNorth Jessicatown, WV 01498North Andrew3ed9f4641e737-362-3685e*****s@gmail.comhttp://meyer.com/300002004-02-26 04:09:51.3259488008
2Christopher7854 Nunez Cove\\nHoodside, SC 05482Hughessidecc1f446ef9+1-145-640-8718f*****e@hotmail.comhttp://cross.info/300002001-08-28 18:56:29.09002560015
3Samuel7583 Heather Prairie\\nScotthaven, AR 94542West Ashleybc69e5439a(719)299-8553x6456u*****s@hotmail.comhttps://www.anderson.com/400002000-12-25 21:59:48.62184000021
4Timothy929 Ellis Hills Apt. 766\\nEast Elizabeth, MO 2...Reesemouthf1e506bf00041.810.3076t*****s@hotmail.comhttps://www.shannon-thomas.net/400001999-03-24 05:19:37.68700800023
.................................
495Natalie54840 Tate Summit Suite 754\\nLake Amberbury, W...Port Jacobstadda31a1d471001-170-972-2673x3215a*****y@veit.co.ukhttps://www.moore.com/400001995-10-09 16:40:58.37931840020
496Jose847 Henson Pike Apt. 762\\nPort Nicholashaven, ...Peterport11f990c971127-355-3556x6363r*****i@euresti.co.ukhttp://perry.com/400001999-06-12 11:23:56.18820480017
497Joanna942 Derek Orchard\\nNew Dawntown, PA 88841Port Dannyborough33daacb57b+1-048-563-2513x3377c*****g@brenning.co.ukhttp://www.bradley.com/400001997-05-20 22:03:29.33133120032
498Karen50796 Karen Fall\\nSouth Whitney, NY 24702Brownview76fbf0ca46001-142-681-2195x7266c*****y@gmail.comhttps://www.carr.org/300001994-02-23 21:48:38.23741440027
499MelissaPSC 0277, Box 3447\\nAPO AP 93784Ramosside93cd3124ab(550)786-3496m*****i@hotmail.comhttps://levy.com/400002000-07-15 20:38:06.73969920017
\n", - "

500 rows × 10 columns

\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ], - "text/plain": [ - " first_name ... age\n", - "0 Betty ... 23\n", - "1 Dustin ... 8\n", - "2 Christopher ... 15\n", - "3 Samuel ... 21\n", - "4 Timothy ... 23\n", - ".. ... ... ..\n", - "495 Natalie ... 20\n", - "496 Jose ... 17\n", - "497 Joanna ... 32\n", - "498 Karen ... 27\n", - "499 Melissa ... 17\n", - "\n", - "[500 rows x 10 columns]" - ] - }, - "metadata": {}, - "execution_count": 77 - } - ] - } - ] -} \ No newline at end of file