diff --git a/examples/pandas.ipynb b/examples/pandas.ipynb
deleted file mode 100644
index 693c1ba..0000000
--- a/examples/pandas.ipynb
+++ /dev/null
@@ -1,1745 +0,0 @@
-{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "name": "examples",
- "provenance": [],
- "collapsed_sections": [
- "6YdsVXnwHgfc"
- ]
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
- }
- },
- "cells": [
- {
- "cell_type": "markdown",
- "source": [
- "# Installation"
- ],
- "metadata": {
- "id": "6YdsVXnwHgfc"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "!pip install cape-privacy==0.3.0 --no-deps"
- ],
- "metadata": {
- "id": "5hfF6zgl8MUL"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "!pip install anonympy==0.2.0"
- ],
- "metadata": {
- "id": "jo7icbdeoVFe"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Importing and Initializing"
- ],
- "metadata": {
- "id": "l8hVgsTQsFlL"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "from anonympy.pandas import dfAnonymizer\n",
- "from anonympy.pandas.utils import load_dataset"
- ],
- "metadata": {
- "id": "GcGQR3FW43iP"
- },
- "execution_count": 43,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "# specify datetime columns\n",
- "df = load_dataset('big')\n",
- "\n",
- "anonym = dfAnonymizer(df)\n",
- "\n",
- "anonym"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "C6MnzoxsBiSr",
- "outputId": "02f9b5c3-1344-45a1-9acc-3ec6bb6b010a"
- },
- "execution_count": 78,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "+-------------------------------+\n",
- "| Total number of columns: 10 |\n",
- "+===============================+\n",
- "| Anonymized Column -> Method: |\n",
- "+-------------------------------+\n",
- "| Unanonymized Columns: |\n",
- "| - first_name |\n",
- "| - address |\n",
- "| - city |\n",
- "| - postal |\n",
- "| - phone |\n",
- "| - email |\n",
- "| - web |\n",
- "| - salary |\n",
- "| - birthdate |\n",
- "| - age |\n",
- "+-------------------------------+"
- ]
- },
- "metadata": {},
- "execution_count": 78
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "So, we have 10 columns. None of them are anonymized yet."
- ],
- "metadata": {
- "id": "9VuuD4VuCF9U"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Check dtypes"
- ],
- "metadata": {
- "id": "DdSuJuuLHoYl"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "Since anonymization methods depend on the data type, it's recommended to check each column's data type before applying any functions."
- ],
- "metadata": {
- "id": "DquwQyMXC8kC"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "print('Rows:', df.shape[0],'\\nColumns:', df.shape[1])\n",
- "df.head()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 374
- },
- "id": "iy1MFIEzDVmH",
- "outputId": "c53f60a6-aaec-423d-affd-e12a2241792b"
- },
- "execution_count": 45,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Rows: 500 \n",
- "Columns: 10\n"
- ]
- },
- {
- "output_type": "execute_result",
- "data": {
- "text/html": [
- "\n",
- " \n",
- "
\n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " first_name \n",
- " address \n",
- " city \n",
- " postal \n",
- " phone \n",
- " email \n",
- " web \n",
- " salary \n",
- " birthdate \n",
- " age \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " Aleshia \n",
- " 14 Taylor St \n",
- " St. Stephens Ward \n",
- " CT2 7PP \n",
- " 01835-703597 \n",
- " atomkiewicz@hotmail.com \n",
- " http://www.alandrosenburgcpapc.co.uk \n",
- " 46391 \n",
- " 2000-12-23 15:09:18.117475200 \n",
- " 21 \n",
- " \n",
- " \n",
- " 1 \n",
- " Evan \n",
- " 5 Binney St \n",
- " Abbey Ward \n",
- " HP11 2AX \n",
- " 01937-864715 \n",
- " evan.zigomalas@gmail.com \n",
- " http://www.capgeminiamerica.co.uk \n",
- " 30798 \n",
- " 2004-04-22 04:09:51.325948800 \n",
- " 17 \n",
- " \n",
- " \n",
- " 2 \n",
- " France \n",
- " 8 Moor Place \n",
- " East Southbourne and Tuckton W \n",
- " BH6 3BE \n",
- " 01347-368222 \n",
- " france.andrade@hotmail.com \n",
- " http://www.elliottjohnwesq.co.uk \n",
- " 32384 \n",
- " 2002-01-21 18:56:29.090025600 \n",
- " 19 \n",
- " \n",
- " \n",
- " 3 \n",
- " Ulysses \n",
- " 505 Exeter Rd \n",
- " Hawerby cum Beesby \n",
- " DN36 5RP \n",
- " 01912-771311 \n",
- " ulysses@hotmail.com \n",
- " http://www.mcmahanbenl.co.uk \n",
- " 39298 \n",
- " 2000-11-24 21:59:48.621840000 \n",
- " 21 \n",
- " \n",
- " \n",
- " 4 \n",
- " Tyisha \n",
- " 5396 Forth Street \n",
- " Greets Green and Lyng Ward \n",
- " B70 9DT \n",
- " 01547-429341 \n",
- " tyisha.veness@hotmail.com \n",
- " http://www.champagneroom.co.uk \n",
- " 41630 \n",
- " 1998-06-23 05:19:37.687008000 \n",
- " 23 \n",
- " \n",
- " \n",
- "
\n",
- "
\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- "\n",
- " \n",
- "
\n",
- "
\n",
- " "
- ],
- "text/plain": [
- " first_name address ... birthdate age\n",
- "0 Aleshia 14 Taylor St ... 2000-12-23 15:09:18.117475200 21\n",
- "1 Evan 5 Binney St ... 2004-04-22 04:09:51.325948800 17\n",
- "2 France 8 Moor Place ... 2002-01-21 18:56:29.090025600 19\n",
- "3 Ulysses 505 Exeter Rd ... 2000-11-24 21:59:48.621840000 21\n",
- "4 Tyisha 5396 Forth Street ... 1998-06-23 05:19:37.687008000 23\n",
- "\n",
- "[5 rows x 10 columns]"
- ]
- },
- "metadata": {},
- "execution_count": 45
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "print('Categorical Columns: ', anonym.categorical_columns)\n",
- "print('Numeric Columns: ', anonym.numeric_columns)\n",
- "print('Datetime Columns: ', anonym.datetime_columns)\n",
- "\n",
- "print('\\nOr Call `info` method\\n ')\n",
- "\n",
- "anonym.info()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "IheXU6FtC19U",
- "outputId": "0dbd3eef-7621-48d1-b42c-e0227f82f6fb"
- },
- "execution_count": 46,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "Categorical Columns: ['first_name', 'address', 'city', 'postal', 'phone', 'email', 'web']\n",
- "Numeric Columns: ['salary', 'age']\n",
- "Datetime Columns: ['birthdate']\n",
- "\n",
- "Or Call `info` method\n",
- " \n",
- "+------------+--------+-------------+--------+\n",
- "| Column | Status | Type | Method |\n",
- "+============+========+=============+========+\n",
- "| first_name | 0 | categorical | |\n",
- "+------------+--------+-------------+--------+\n",
- "| address | 0 | categorical | |\n",
- "+------------+--------+-------------+--------+\n",
- "| city | 0 | categorical | |\n",
- "+------------+--------+-------------+--------+\n",
- "| postal | 0 | categorical | |\n",
- "+------------+--------+-------------+--------+\n",
- "| phone | 0 | categorical | |\n",
- "+------------+--------+-------------+--------+\n",
- "| email | 0 | categorical | |\n",
- "+------------+--------+-------------+--------+\n",
- "| web | 0 | categorical | |\n",
- "+------------+--------+-------------+--------+\n",
- "| salary | 0 | numeric | |\n",
- "+------------+--------+-------------+--------+\n",
- "| birthdate | 0 | datetime | |\n",
- "+------------+--------+-------------+--------+\n",
- "| age | 0 | numeric | |\n",
- "+------------+--------+-------------+--------+\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "There are 7 categorical, 2 numeric and 1 datetime columns.  \n",
- "Let's see which methods we can apply to categorical columns.  \n",
- "List of available methods: `anonympy.pandas.utils.available_methods`"
- ],
- "metadata": {
- "id": "9jBf13NXD8RS"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "from anonympy.pandas.utils import available_methods"
- ],
- "metadata": {
- "id": "e1Yey5BGDuLG"
- },
- "execution_count": 47,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "available_methods('categorical') # args: 'categorical' / 'numerical' / 'datetime' / 'general' / None"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "t2gqFZdoEU9K",
- "outputId": "843b2a43-2d99-438b-8023-74e012e09dfd"
- },
- "execution_count": 48,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "categorical_fake\tcategorical_fake_auto\tcategorical_resampling\tcategorical_tokenization\tcategorical_email_masking\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Categorical Email Masking\n",
- "\n",
- "To apply partial email masking, call `categorical_email_masking` on the corresponding column."
- ],
- "metadata": {
- "id": "vIr7XFBoSoDh"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "anonym.categorical_email_masking(columns='email', inplace = False)"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "Lxbx7cmbSquq",
- "outputId": "32075f7e-76d3-45e0-9865-124ba8bd147a"
- },
- "execution_count": 49,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "0 a*****z@hotmail.com\n",
- "1 e*****s@gmail.com\n",
- "2 f*****e@hotmail.com\n",
- "3 u*****s@hotmail.com\n",
- "4 t*****s@hotmail.com\n",
- " ... \n",
- "495 a*****y@veit.co.uk\n",
- "496 r*****i@euresti.co.uk\n",
- "497 c*****g@brenning.co.uk\n",
- "498 c*****y@gmail.com\n",
- "499 m*****i@hotmail.com\n",
- "Name: email, Length: 500, dtype: object"
- ]
- },
- "metadata": {},
- "execution_count": 49
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "`inplace=True` (default) - changes will be applied to the dataframe. Access using `anonym.to_df()`\n",
- " \n",
- "`inplace = False` - return the changes"
- ],
- "metadata": {
- "id": "ma_Bf0IATgyc"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "anonym.categorical_email_masking('email') # inplace = True\n",
- "\n",
- "print(anonym.anonymized_columns, '\\n')\n",
- "\n",
- "print(anonym)"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "u--sEHaXTfMs",
- "outputId": "ed34ea67-c8a5-42ac-ec38-3944eb191851"
- },
- "execution_count": 50,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "['email'] \n",
- "\n",
- "+-------------------------------+\n",
- "| Total number of columns: 10 |\n",
- "+===============================+\n",
- "| Anonymized Column -> Method: |\n",
- "| - email -> Partial Masking |\n",
- "+-------------------------------+\n",
- "| Unanonymized Columns: |\n",
- "| - first_name |\n",
- "| - address |\n",
- "| - city |\n",
- "| - postal |\n",
- "| - phone |\n",
- "| - web |\n",
- "| - salary |\n",
- "| - birthdate |\n",
- "| - age |\n",
- "+-------------------------------+\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "Now **email** column appears to be anonymized"
- ],
- "metadata": {
- "id": "wYK1onhthhc-"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Categorical Synthetic data (fake data) \n",
- "Next, let's replace some values with synthetically generated ones. \n",
- "If a column name is the same as the corresponding Faker method, use the `categorical_fake_auto` method.  \n",
- "List of Faker methods: `anonympy.pandas.utils.fake_methods`"
- ],
- "metadata": {
- "id": "JCgS0Zr0F_ni"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "from anonympy.pandas.utils import fake_methods"
- ],
- "metadata": {
- "id": "5nn0FVkWEyd0"
- },
- "execution_count": 51,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "fake_methods() # args: letter / None/ 'all'\n",
- "# output in the following screenshot"
- ],
- "metadata": {
- "id": "0t36isKIHgd-",
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "outputId": "bd13d825-3d93-4ec6-890c-72fd239fb9b9"
- },
- "execution_count": 52,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "A | aba, address, administrative_unit, am_pm, android_platform_token, ascii_company_email, ascii_email, ascii_free_email, ascii_safe_email\n",
- "B | bank_country, bban, boolean, bothify, bs, building_number\n",
- "C | cache_pattern, catch_phrase, century, chrome, city, city_prefix, city_suffix, color, color_name, company, company_email, company_suffix, coordinate, country, country_calling_code, country_code, credit_card_expire, credit_card_full, credit_card_number, credit_card_provider, credit_card_security_code, cryptocurrency, cryptocurrency_code, cryptocurrency_name, csv, currency, currency_code, currency_name, currency_symbol, current_country, current_country_code\n",
- "D | date, date_between, date_between_dates, date_object, date_of_birth, date_this_century, date_this_decade, date_this_month, date_this_year, date_time, date_time_ad, date_time_between, date_time_between_dates, date_time_this_century, date_time_this_decade, date_time_this_month, date_time_this_year, day_of_month, day_of_week, del_arguments, dga, domain_name, domain_word, dsv\n",
- "E | ean, ean13, ean8, ein, email\n",
- "F | factories, file_extension, file_name, file_path, firefox, first_name, first_name_female, first_name_male, first_name_nonbinary, fixed_width, format, free_email, free_email_domain, future_date, future_datetime\n",
- "G | generator_attrs, get_arguments, get_formatter, get_providers\n",
- "H | hex_color, hexify, hostname, http_method\n",
- "I | iana_id, iban, image, image_url, internet_explorer, invalid_ssn, ios_platform_token, ipv4, ipv4_network_class, ipv4_private, ipv4_public, ipv6, isbn10, isbn13, iso8601, items, itin\n",
- "J | job, json\n",
- "L | language_code, language_name, last_name, last_name_female, last_name_male, last_name_nonbinary, latitude, latlng, lexify, license_plate, linux_platform_token, linux_processor, local_latlng, locale, locales, localized_ean, localized_ean13, localized_ean8, location_on_land, longitude\n",
- "M | mac_address, mac_platform_token, mac_processor, md5, military_apo, military_dpo, military_ship, military_state, mime_type, month, month_name, msisdn\n",
- "N | name, name_female, name_male, name_nonbinary, nic_handle, nic_handles, null_boolean, numerify\n",
- "O | opera\n",
- "P | paragraph, paragraphs, parse, password, past_date, past_datetime, phone_number, port_number, postalcode, postalcode_in_state, postalcode_plus4, postcode, postcode_in_state, prefix, prefix_female, prefix_male, prefix_nonbinary, pricetag, profile, provider, providers, psv, pybool, pydecimal, pydict, pyfloat, pyint, pyiterable, pylist, pyset, pystr, pystr_format, pystruct, pytimezone, pytuple\n",
- "R | random, random_choices, random_digit, random_digit_not_null, random_digit_not_null_or_empty, random_digit_or_empty, random_element, random_elements, random_int, random_letter, random_letters, random_lowercase_letter, random_number, random_sample, random_uppercase_letter, randomize_nb_elements, rgb_color, rgb_css_color, ripe_id\n",
- "S | safari, safe_color_name, safe_domain_name, safe_email, safe_hex_color, secondary_address, seed_instance, seed_locale, sentence, sentences, set_arguments, set_formatter, sha1, sha256, simple_profile, slug, ssn, state, state_abbr, street_address, street_name, street_suffix, suffix, suffix_female, suffix_male, suffix_nonbinary, swift, swift11, swift8\n",
- "T | tar, text, texts, time, time_delta, time_object, time_series, timezone, tld, tsv\n",
- "U | unique, unix_device, unix_partition, unix_time, upc_a, upc_e, uri, uri_extension, uri_page, uri_path, url, user_agent, user_name, uuid4\n",
- "W | weights, windows_platform_token, word, words\n",
- "Y | year\n",
- "Z | zipcode, zipcode_in_state, zipcode_plus4\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "As we can see from this list, column names: **address**, **city** and **first_name** correspond to faker's methods. So we can run `categorical_fake_auto` and it will anonymize these 3 columns\n",
- "* `email` method is also there, but since we already anonymized it, it won't get affected.\n",
- "\n",
- "![image.png]()"
- ],
- "metadata": {
- "id": "Efy7zl9xJjGh"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "anonym.categorical_fake_auto() # default args: locale='en_US', inplace=True\n",
- "\n",
- "# we could use the `categorical_fake` method as well, specifying the columns\n",
- "# anonym.categorical_fake(['first_name', 'address', 'city'])\n",
- "\n",
- "print(anonym)"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "1h4uErHyJoCZ",
- "outputId": "c83dd408-c618-478f-8a6b-2193df46afba"
- },
- "execution_count": 53,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "`email` column already anonymized!\n",
- "+--------------------------------+\n",
- "| Total number of columns: 10 |\n",
- "+================================+\n",
- "| Anonymized Column -> Method: |\n",
- "| - email -> Partial Masking |\n",
- "| - first_name -> Synthetic Data |\n",
- "| - address -> Synthetic Data |\n",
- "| - city -> Synthetic Data |\n",
- "+--------------------------------+\n",
- "| Unanonymized Columns: |\n",
- "| - postal |\n",
- "| - phone |\n",
- "| - web |\n",
- "| - salary |\n",
- "| - birthdate |\n",
- "| - age |\n",
- "+--------------------------------+\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "Let's call `fake_methods` for letter 'u'"
- ],
- "metadata": {
- "id": "ezXj21Kwj5xk"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "fake_methods('u')"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "g-fVpOSVj5Or",
- "outputId": "fac57759-8a09-4b5b-bf48-9e2063f9f70c"
- },
- "execution_count": 54,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- " unique, unix_device, unix_partition, unix_time, upc_a, upc_e, uri, uri_extension, uri_page, uri_path, url, user_agent, user_name, uuid4\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "In our dataframe, the '**web**' column stores different **urls**. So we can use the `url` method on the '**web**' column. \n",
- "\n",
- "And column '**phone**' values can be replaced using `phone_number` method\n",
- "\n",
- "Let's apply these changes using `categorical_fake` method which accepts list of column names (if similar to method name) or a dictionary (when names differ)"
- ],
- "metadata": {
- "id": "1GSUKASskUMN"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "anonym.categorical_fake({'web':'url', 'phone': 'phone_number'}) # inplace = True\n",
- "\n",
- "anonym.info()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "9xOiIlg0koNv",
- "outputId": "d52a595e-d196-47e6-b282-f8113139b1b6"
- },
- "execution_count": 55,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "+------------+--------+-------------+-----------------+\n",
- "| Column | Status | Type | Method |\n",
- "+============+========+=============+=================+\n",
- "| first_name | 1 | categorical | Synthetic Data |\n",
- "+------------+--------+-------------+-----------------+\n",
- "| address | 1 | categorical | Synthetic Data |\n",
- "+------------+--------+-------------+-----------------+\n",
- "| city | 1 | categorical | Synthetic Data |\n",
- "+------------+--------+-------------+-----------------+\n",
- "| postal | 0 | categorical | |\n",
- "+------------+--------+-------------+-----------------+\n",
- "| phone | 1 | categorical | Synthetic Data |\n",
- "+------------+--------+-------------+-----------------+\n",
- "| email | 1 | categorical | Partial Masking |\n",
- "+------------+--------+-------------+-----------------+\n",
- "| web | 1 | categorical | Synthetic Data |\n",
- "+------------+--------+-------------+-----------------+\n",
- "| salary | 0 | numeric | |\n",
- "+------------+--------+-------------+-----------------+\n",
- "| birthdate | 0 | datetime | |\n",
- "+------------+--------+-------------+-----------------+\n",
- "| age | 0 | numeric | |\n",
- "+------------+--------+-------------+-----------------+\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Categorical Tokenization\n"
- ],
- "metadata": {
- "id": "NKODUxOmn1fk"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "Let's apply tokenization on '**postal**' column"
- ],
- "metadata": {
- "id": "md6pqB-boElC"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "print(anonym.categorical_tokenization('postal', inplace = False)) # first inplace = False, to see how the changes will look\n",
- "\n",
- "anonym.categorical_tokenization('postal') # inplace = True, by default"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "VhHUGjftoA-U",
- "outputId": "eaf39bdd-562a-41a8-aaa7-d5398e3b7e7b"
- },
- "execution_count": 56,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "0 2a8ba32381\n",
- "1 1569763104\n",
- "2 73c0b907be\n",
- "3 b155a02362\n",
- "4 56a6b1e03a\n",
- " ... \n",
- "495 7687741a49\n",
- "496 6a16bbf755\n",
- "497 bedd62bb14\n",
- "498 30dae3be6e\n",
- "499 7c49b8248b\n",
- "Name: postal, Length: 500, dtype: object\n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# which columns are left unanonymized? \n",
- "print(anonym.unanonymized_columns)"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "rO1SI4H2pXyr",
- "outputId": "0eeb5fe6-d51e-4b22-ae6a-e8a531c50097"
- },
- "execution_count": 58,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "['salary', 'birthdate', 'age']\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Datetime Perturbation (noise)"
- ],
- "metadata": {
- "id": "x301iHA7peji"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "Let's add some random noise to '**birthdate**' column\n",
- " Also, we should specify to add noise only to days and month but not year."
- ],
- "metadata": {
- "id": "NWLhBATkpk20"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "print(anonym.datetime_noise('birthdate', frequency=('MONTH', 'DAY'), inplace = False), '\\n') # inplace = False to observe the changes\n",
- "\n",
- "print(df.birthdate) # for comparison"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "mcS3tL6kpgd3",
- "outputId": "2be407a8-2f95-4d57-feb0-85ee8f3702ce"
- },
- "execution_count": 65,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "0 2000-03-30 15:09:18.117475200\n",
- "1 2004-05-18 04:09:51.325948800\n",
- "2 2002-07-15 18:56:29.090025600\n",
- "3 2000-11-19 21:59:48.621840000\n",
- "4 1998-05-20 05:19:37.687008000\n",
- " ... \n",
- "495 1994-09-15 16:40:58.379318400\n",
- "496 1998-11-08 11:23:56.188204800\n",
- "497 1998-06-12 22:03:29.331331200\n",
- "498 1995-02-15 21:48:38.237414400\n",
- "499 2000-10-12 20:38:06.739699200\n",
- "Name: birthdate, Length: 500, dtype: datetime64[ns] \n",
- "\n",
- "0 2000-12-23 15:09:18.117475200\n",
- "1 2004-04-22 04:09:51.325948800\n",
- "2 2002-01-21 18:56:29.090025600\n",
- "3 2000-11-24 21:59:48.621840000\n",
- "4 1998-06-23 05:19:37.687008000\n",
- " ... \n",
- "495 1995-06-08 16:40:58.379318400\n",
- "496 1999-02-10 11:23:56.188204800\n",
- "497 1998-01-13 22:03:29.331331200\n",
- "498 1994-12-20 21:48:38.237414400\n",
- "499 2000-02-13 20:38:06.739699200\n",
- "Name: birthdate, Length: 500, dtype: datetime64[ns]\n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "anonym.datetime_noise('birthdate', frequency=('MONTH', 'DAY')) # inplace=True, to apply the changes"
- ],
- "metadata": {
- "id": "rKmHk_aVqv3f"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "anonym.info()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "mWsbXFSCq1FY",
- "outputId": "63581894-6f72-4517-f1b0-29398f892db7"
- },
- "execution_count": 67,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "+------------+--------+-------------+-----------------------+\n",
- "| Column | Status | Type | Method |\n",
- "+============+========+=============+=======================+\n",
- "| first_name | 1 | categorical | Synthetic Data |\n",
- "+------------+--------+-------------+-----------------------+\n",
- "| address | 1 | categorical | Synthetic Data |\n",
- "+------------+--------+-------------+-----------------------+\n",
- "| city | 1 | categorical | Synthetic Data |\n",
- "+------------+--------+-------------+-----------------------+\n",
- "| postal | 1 | categorical | Tokenization |\n",
- "+------------+--------+-------------+-----------------------+\n",
- "| phone | 1 | categorical | Synthetic Data |\n",
- "+------------+--------+-------------+-----------------------+\n",
- "| email | 1 | categorical | Partial Masking |\n",
- "+------------+--------+-------------+-----------------------+\n",
- "| web | 1 | categorical | Synthetic Data |\n",
- "+------------+--------+-------------+-----------------------+\n",
- "| salary | 0 | numeric | |\n",
- "+------------+--------+-------------+-----------------------+\n",
- "| birthdate | 1 | datetime | Datetime Perturbation |\n",
- "+------------+--------+-------------+-----------------------+\n",
- "| age | 0 | numeric | |\n",
- "+------------+--------+-------------+-----------------------+\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "Two numeric columns: '**salary**' and '**age**' are left"
- ],
- "metadata": {
- "id": "5E4QdD_oq3hK"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Numeric Rounding "
- ],
- "metadata": {
- "id": "m6YhI21CrDVE"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "Applying `numeric_rounding` to '**salary**' column"
- ],
- "metadata": {
- "id": "EHkjZIYOsNvt"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "print(df.salary, '\\n') # original\n",
- "anonym.numeric_rounding('salary', inplace = False) # see the changes"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "6rqrKURWrGAg",
- "outputId": "b9b7a53f-73a3-4f13-99ba-6e07dd20242e"
- },
- "execution_count": 68,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "0 46391\n",
- "1 30798\n",
- "2 32384\n",
- "3 39298\n",
- "4 41630\n",
- " ... \n",
- "495 42239\n",
- "496 42640\n",
- "497 44982\n",
- "498 32827\n",
- "499 41266\n",
- "Name: salary, Length: 500, dtype: int64 \n",
- "\n"
- ]
- },
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "0 50000\n",
- "1 30000\n",
- "2 30000\n",
- "3 40000\n",
- "4 40000\n",
- " ... \n",
- "495 40000\n",
- "496 40000\n",
- "497 40000\n",
- "498 30000\n",
- "499 40000\n",
- "Name: salary, Length: 500, dtype: int64"
- ]
- },
- "metadata": {},
- "execution_count": 68
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "anonym.numeric_rounding('salary') # apply the changes"
- ],
- "metadata": {
- "id": "XE1rXt4-r-I1"
- },
- "execution_count": 69,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Numeric Perturbation (noise)"
- ],
- "metadata": {
- "id": "jx4zer-nsGBr"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- " Some noise to '**age**' column would be nice"
- ],
- "metadata": {
- "id": "P3SwYoXcs_a1"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "print(df.age, '\\n') # original\n",
- "\n",
- "print(anonym.numeric_noise('age', inplace = False)) # observe the changes"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "PU5fy0wXsJx6",
- "outputId": "acd07350-4d94-4555-aaa5-f8c9bd21b21a"
- },
- "execution_count": 71,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "0 21\n",
- "1 17\n",
- "2 19\n",
- "3 21\n",
- "4 23\n",
- " ..\n",
- "495 26\n",
- "496 22\n",
- "497 23\n",
- "498 27\n",
- "499 21\n",
- "Name: age, Length: 500, dtype: int64 \n",
- "\n",
- "0 28\n",
- "1 17\n",
- "2 25\n",
- "3 26\n",
- "4 17\n",
- " ..\n",
- "495 17\n",
- "496 19\n",
- "497 25\n",
- "498 21\n",
- "499 28\n",
- "Length: 500, dtype: int64\n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "anonym.numeric_noise('age') # apply the changes"
- ],
- "metadata": {
- "id": "moJtlWantd4d"
- },
- "execution_count": 72,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "anonym"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "g8Whzhwmtui2",
- "outputId": "17bda6eb-0655-4be0-d8e7-4a369e3ddbd5"
- },
- "execution_count": 73,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "+---------------------------------------+\n",
- "| Total number of columns: 10 |\n",
- "+=======================================+\n",
- "| Anonymized Column -> Method: |\n",
- "| - email -> Partial Masking |\n",
- "| - first_name -> Synthetic Data |\n",
- "| - address -> Synthetic Data |\n",
- "| - city -> Synthetic Data |\n",
- "| - web -> Synthetic Data |\n",
- "| - phone -> Synthetic Data |\n",
- "| - postal -> Tokenization |\n",
- "| - birthdate -> Datetime Perturbation |\n",
- "| - salary -> Generalization - Rounding |\n",
- "| - age -> Numeric Perturbation |\n",
- "+---------------------------------------+\n",
- "| Unanonymized Columns: |\n",
- "| |\n",
- "+---------------------------------------+"
- ]
- },
- "metadata": {},
- "execution_count": 73
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "All columns have been successfully anonymized, let's now compare both datasets before and after anonymization "
- ],
- "metadata": {
- "id": "Fx5pcH7YtxSU"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "df.head()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 337
- },
- "id": "aUhWIBF9twM1",
- "outputId": "1c1b74bc-753d-4347-c871-c1d492f2942d"
- },
- "execution_count": 76,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/html": [
- "\n",
- " \n",
- "
\n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " first_name \n",
- " address \n",
- " city \n",
- " postal \n",
- " phone \n",
- " email \n",
- " web \n",
- " salary \n",
- " birthdate \n",
- " age \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " Aleshia \n",
- " 14 Taylor St \n",
- " St. Stephens Ward \n",
- " CT2 7PP \n",
- " 01835-703597 \n",
- " atomkiewicz@hotmail.com \n",
- " http://www.alandrosenburgcpapc.co.uk \n",
- " 46391 \n",
- " 2000-12-23 15:09:18.117475200 \n",
- " 21 \n",
- " \n",
- " \n",
- " 1 \n",
- " Evan \n",
- " 5 Binney St \n",
- " Abbey Ward \n",
- " HP11 2AX \n",
- " 01937-864715 \n",
- " evan.zigomalas@gmail.com \n",
- " http://www.capgeminiamerica.co.uk \n",
- " 30798 \n",
- " 2004-04-22 04:09:51.325948800 \n",
- " 17 \n",
- " \n",
- " \n",
- " 2 \n",
- " France \n",
- " 8 Moor Place \n",
- " East Southbourne and Tuckton W \n",
- " BH6 3BE \n",
- " 01347-368222 \n",
- " france.andrade@hotmail.com \n",
- " http://www.elliottjohnwesq.co.uk \n",
- " 32384 \n",
- " 2002-01-21 18:56:29.090025600 \n",
- " 19 \n",
- " \n",
- " \n",
- " 3 \n",
- " Ulysses \n",
- " 505 Exeter Rd \n",
- " Hawerby cum Beesby \n",
- " DN36 5RP \n",
- " 01912-771311 \n",
- " ulysses@hotmail.com \n",
- " http://www.mcmahanbenl.co.uk \n",
- " 39298 \n",
- " 2000-11-24 21:59:48.621840000 \n",
- " 21 \n",
- " \n",
- " \n",
- " 4 \n",
- " Tyisha \n",
- " 5396 Forth Street \n",
- " Greets Green and Lyng Ward \n",
- " B70 9DT \n",
- " 01547-429341 \n",
- " tyisha.veness@hotmail.com \n",
- " http://www.champagneroom.co.uk \n",
- " 41630 \n",
- " 1998-06-23 05:19:37.687008000 \n",
- " 23 \n",
- " \n",
- " \n",
- "
\n",
- "
\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- "\n",
- " \n",
- "
\n",
- "
\n",
- " "
- ],
- "text/plain": [
- " first_name address ... birthdate age\n",
- "0 Aleshia 14 Taylor St ... 2000-12-23 15:09:18.117475200 21\n",
- "1 Evan 5 Binney St ... 2004-04-22 04:09:51.325948800 17\n",
- "2 France 8 Moor Place ... 2002-01-21 18:56:29.090025600 19\n",
- "3 Ulysses 505 Exeter Rd ... 2000-11-24 21:59:48.621840000 21\n",
- "4 Tyisha 5396 Forth Street ... 1998-06-23 05:19:37.687008000 23\n",
- "\n",
- "[5 rows x 10 columns]"
- ]
- },
- "metadata": {},
- "execution_count": 76
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "anonym.to_df()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 867
- },
- "id": "A_qehCevuDJG",
- "outputId": "cdb70418-89db-4bdc-c4ef-87c8326b406f"
- },
- "execution_count": 77,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/html": [
- "\n",
- " \n",
- "
\n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " first_name \n",
- " address \n",
- " city \n",
- " postal \n",
- " phone \n",
- " email \n",
- " web \n",
- " salary \n",
- " birthdate \n",
- " age \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " 0 \n",
- " Betty \n",
- " 037 Anderson Branch Apt. 184\\nPowellfurt, AL 5... \n",
- " Annside \n",
- " ac9296b7cd \n",
- " +1-669-652-5362x575 \n",
- " a*****z@hotmail.com \n",
- " http://wilson.com/ \n",
- " 50000 \n",
- " 2001-02-21 15:09:18.117475200 \n",
- " 23 \n",
- " \n",
- " \n",
- " 1 \n",
- " Dustin \n",
- " 32977 Amy Shores\\nNorth Jessicatown, WV 01498 \n",
- " North Andrew \n",
- " 3ed9f4641e \n",
- " 737-362-3685 \n",
- " e*****s@gmail.com \n",
- " http://meyer.com/ \n",
- " 30000 \n",
- " 2004-02-26 04:09:51.325948800 \n",
- " 8 \n",
- " \n",
- " \n",
- " 2 \n",
- " Christopher \n",
- " 7854 Nunez Cove\\nHoodside, SC 05482 \n",
- " Hughesside \n",
- " cc1f446ef9 \n",
- " +1-145-640-8718 \n",
- " f*****e@hotmail.com \n",
- " http://cross.info/ \n",
- " 30000 \n",
- " 2001-08-28 18:56:29.090025600 \n",
- " 15 \n",
- " \n",
- " \n",
- " 3 \n",
- " Samuel \n",
- " 7583 Heather Prairie\\nScotthaven, AR 94542 \n",
- " West Ashley \n",
- " bc69e5439a \n",
- " (719)299-8553x6456 \n",
- " u*****s@hotmail.com \n",
- " https://www.anderson.com/ \n",
- " 40000 \n",
- " 2000-12-25 21:59:48.621840000 \n",
- " 21 \n",
- " \n",
- " \n",
- " 4 \n",
- " Timothy \n",
- " 929 Ellis Hills Apt. 766\\nEast Elizabeth, MO 2... \n",
- " Reesemouth \n",
- " f1e506bf00 \n",
- " 041.810.3076 \n",
- " t*****s@hotmail.com \n",
- " https://www.shannon-thomas.net/ \n",
- " 40000 \n",
- " 1999-03-24 05:19:37.687008000 \n",
- " 23 \n",
- " \n",
- " \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " ... \n",
- " \n",
- " \n",
- " 495 \n",
- " Natalie \n",
- " 54840 Tate Summit Suite 754\\nLake Amberbury, W... \n",
- " Port Jacobstad \n",
- " da31a1d471 \n",
- " 001-170-972-2673x3215 \n",
- " a*****y@veit.co.uk \n",
- " https://www.moore.com/ \n",
- " 40000 \n",
- " 1995-10-09 16:40:58.379318400 \n",
- " 20 \n",
- " \n",
- " \n",
- " 496 \n",
- " Jose \n",
- " 847 Henson Pike Apt. 762\\nPort Nicholashaven, ... \n",
- " Peterport \n",
- " 11f990c971 \n",
- " 127-355-3556x6363 \n",
- " r*****i@euresti.co.uk \n",
- " http://perry.com/ \n",
- " 40000 \n",
- " 1999-06-12 11:23:56.188204800 \n",
- " 17 \n",
- " \n",
- " \n",
- " 497 \n",
- " Joanna \n",
- " 942 Derek Orchard\\nNew Dawntown, PA 88841 \n",
- " Port Dannyborough \n",
- " 33daacb57b \n",
- " +1-048-563-2513x3377 \n",
- " c*****g@brenning.co.uk \n",
- " http://www.bradley.com/ \n",
- " 40000 \n",
- " 1997-05-20 22:03:29.331331200 \n",
- " 32 \n",
- " \n",
- " \n",
- " 498 \n",
- " Karen \n",
- " 50796 Karen Fall\\nSouth Whitney, NY 24702 \n",
- " Brownview \n",
- " 76fbf0ca46 \n",
- " 001-142-681-2195x7266 \n",
- " c*****y@gmail.com \n",
- " https://www.carr.org/ \n",
- " 30000 \n",
- " 1994-02-23 21:48:38.237414400 \n",
- " 27 \n",
- " \n",
- " \n",
- " 499 \n",
- " Melissa \n",
- " PSC 0277, Box 3447\\nAPO AP 93784 \n",
- " Ramosside \n",
- " 93cd3124ab \n",
- " (550)786-3496 \n",
- " m*****i@hotmail.com \n",
- " https://levy.com/ \n",
- " 40000 \n",
- " 2000-07-15 20:38:06.739699200 \n",
- " 17 \n",
- " \n",
- " \n",
- "
\n",
- "
500 rows × 10 columns
\n",
- "
\n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- " \n",
- "\n",
- " \n",
- "
\n",
- "
\n",
- " "
- ],
- "text/plain": [
- " first_name ... age\n",
- "0 Betty ... 23\n",
- "1 Dustin ... 8\n",
- "2 Christopher ... 15\n",
- "3 Samuel ... 21\n",
- "4 Timothy ... 23\n",
- ".. ... ... ..\n",
- "495 Natalie ... 20\n",
- "496 Jose ... 17\n",
- "497 Joanna ... 32\n",
- "498 Karen ... 27\n",
- "499 Melissa ... 17\n",
- "\n",
- "[500 rows x 10 columns]"
- ]
- },
- "metadata": {},
- "execution_count": 77
- }
- ]
- }
- ]
-}
\ No newline at end of file