From 591dc2936da6d7dd0f8c2290af8ddd6609b45644 Mon Sep 17 00:00:00 2001 From: silil Date: Sun, 30 Jun 2019 22:08:47 +0100 Subject: [PATCH 1/3] minor changes --- .../record-linkage/RecordLinkage.ipynb | 2615 ++++++++++++++++- 1 file changed, 2467 insertions(+), 148 deletions(-) diff --git a/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb b/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb index da5b6a18..5af506bf 100644 --- a/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb +++ b/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb @@ -12,7 +12,7 @@ "\n", "## Introduction\n", "\n", - "When you combine information about from multiple sources, you have to determine whether two individuals in two separate datasets are the same. You also might have multiple individuals with the same name in one dataset and need to decide whether to treat them as the same person or not. This has important implications for your analysis. Record linkage also goes by the terms data matching, merge/purge, duplication detection, de-duping, reference matching, co-reference/anaphora in various fields. \n", + "When you combine information from multiple sources, you have to determine whether two individuals in two separate datasets are the same. You also might have multiple individuals with the same name in one dataset and need to decide whether to treat them as the same person or not. This has important implications for your analysis. Record linkage also goes by the terms data matching, merge/purge, duplication detection, de-duping, reference matching, co-reference/anaphora in various fields. 
\n", "\n", "There are several approaches to record linkage that include **exact matching** (for example, joining records based on social security number), **rule-based linking** (applying a hierarchical set of rules that reflect domain knowledge; for example, if two people have the same first and last name and the same birthday they are considered the same); and **probabilistic linking**, or estimating the likelihood that two entities are the same and then deciding on a threshold above which two individuals will be considered to be the same. \n", "\n", @@ -42,9 +42,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:28:39.182112Z", + "start_time": "2019-06-30T20:28:37.683186Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Populating the interactive namespace from numpy and matplotlib\n" + ] + } + ], "source": [ "%pylab inline\n", "from __future__ import print_function\n", @@ -73,8 +86,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:28:42.776555Z", + "start_time": "2019-06-30T20:28:42.093983Z" + } + }, "outputs": [], "source": [ "# Read in NSF awards data\n", @@ -83,9 +101,195 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:28:43.497302Z", + "start_time": "2019-06-30T20:28:43.419318Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AwardIdFirstNameLastNameStartDateEndDateAwardTitleAwardEffectiveDateAwardExpirationDateNameCityNameZipCodePhoneNumberStreetAddressCountryNameStateNameStateCode
0415302JeffreyKuhn2010-01-15NaNAdvanced Technology Solar Telescope (ATST) Con...2010-01-012015-09-30Association of Universities for Research in As...Washington20005-39292.024832e+091212 New York Avenue, N.W.,United StatesDistrict of ColumbiaDC
1415302RobertRosner2010-01-15NaNAdvanced Technology Solar Telescope (ATST) Con...2010-01-012015-09-30Association of Universities for Research in As...Washington20005-39292.024832e+091212 New York Avenue, N.W.,United StatesDistrict of ColumbiaDC
2415302PhilipGoode2010-01-15NaNAdvanced Technology Solar Telescope (ATST) Con...2010-01-012015-09-30Association of Universities for Research in As...Washington20005-39292.024832e+091212 New York Avenue, N.W.,United StatesDistrict of ColumbiaDC
3415302ThomasRimmele2012-03-15NaNAdvanced Technology Solar Telescope (ATST) Con...2010-01-012015-09-30Association of Universities for Research in As...Washington20005-39292.024832e+091212 New York Avenue, N.W.,United StatesDistrict of ColumbiaDC
4415302StephenKeil2010-01-152012-03-15Advanced Technology Solar Telescope (ATST) Con...2010-01-012015-09-30Association of Universities for Research in As...Washington20005-39292.024832e+091212 New York Avenue, N.W.,United StatesDistrict of ColumbiaDC
\n", + "
" + ], + "text/plain": [ + " AwardId FirstName LastName StartDate EndDate \\\n", + "0 415302 Jeffrey Kuhn 2010-01-15 NaN \n", + "1 415302 Robert Rosner 2010-01-15 NaN \n", + "2 415302 Philip Goode 2010-01-15 NaN \n", + "3 415302 Thomas Rimmele 2012-03-15 NaN \n", + "4 415302 Stephen Keil 2010-01-15 2012-03-15 \n", + "\n", + " AwardTitle AwardEffectiveDate \\\n", + "0 Advanced Technology Solar Telescope (ATST) Con... 2010-01-01 \n", + "1 Advanced Technology Solar Telescope (ATST) Con... 2010-01-01 \n", + "2 Advanced Technology Solar Telescope (ATST) Con... 2010-01-01 \n", + "3 Advanced Technology Solar Telescope (ATST) Con... 2010-01-01 \n", + "4 Advanced Technology Solar Telescope (ATST) Con... 2010-01-01 \n", + "\n", + " AwardExpirationDate Name \\\n", + "0 2015-09-30 Association of Universities for Research in As... \n", + "1 2015-09-30 Association of Universities for Research in As... \n", + "2 2015-09-30 Association of Universities for Research in As... \n", + "3 2015-09-30 Association of Universities for Research in As... \n", + "4 2015-09-30 Association of Universities for Research in As... 
\n", + "\n", + " CityName ZipCode PhoneNumber StreetAddress \\\n", + "0 Washington 20005-3929 2.024832e+09 1212 New York Avenue, N.W., \n", + "1 Washington 20005-3929 2.024832e+09 1212 New York Avenue, N.W., \n", + "2 Washington 20005-3929 2.024832e+09 1212 New York Avenue, N.W., \n", + "3 Washington 20005-3929 2.024832e+09 1212 New York Avenue, N.W., \n", + "4 Washington 20005-3929 2.024832e+09 1212 New York Avenue, N.W., \n", + "\n", + " CountryName StateName StateCode \n", + "0 United States District of Columbia DC \n", + "1 United States District of Columbia DC \n", + "2 United States District of Columbia DC \n", + "3 United States District of Columbia DC \n", + "4 United States District of Columbia DC " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Take a first look at the data\n", "df_nsf_awards.head()" @@ -100,8 +304,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:28:48.594425Z", + "start_time": "2019-06-30T20:28:47.255452Z" + } + }, "outputs": [], "source": [ "# Read in UC data\n", @@ -110,9 +319,138 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:28:48.640269Z", + "start_time": "2019-06-30T20:28:48.599594Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDyearcampusnametitlegrossbaseovertimeextraexclude
017519712011BERKELEY***********TUTOR - NON-GSHIP0.490.00.000
117589842011BERKELEY***********TUTOR - NON-GSHIP0.490.00.000
218215852011IRVINE***********TUTOR - NON-GSHIP0.510.00.000
319668462011SAN FRANCISCOMACKEWICZ , CARL ESTAFF RESEARCH ASSOC III0.620.00.000
417589472011BERKELEY***********READER - NON-GSHIP0.730.00.000
\n", + "
" + ], + "text/plain": [ + " ID year campus name title \\\n", + "0 1751971 2011 BERKELEY *********** TUTOR - NON-GSHIP \n", + "1 1758984 2011 BERKELEY *********** TUTOR - NON-GSHIP \n", + "2 1821585 2011 IRVINE *********** TUTOR - NON-GSHIP \n", + "3 1966846 2011 SAN FRANCISCO MACKEWICZ , CARL E STAFF RESEARCH ASSOC III \n", + "4 1758947 2011 BERKELEY *********** READER - NON-GSHIP \n", + "\n", + " gross base overtime extra exclude \n", + "0 0.49 0.0 0.0 0 0 \n", + "1 0.49 0.0 0.0 0 0 \n", + "2 0.51 0.0 0.0 0 0 \n", + "3 0.62 0.0 0.0 0 0 \n", + "4 0.73 0.0 0.0 0 0 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Look at what the UC data contains\n", "df_ucpay.head()" @@ -142,9 +480,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:28:52.265775Z", + "start_time": "2019-06-30T20:28:52.251769Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([2011])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get all unique entries in the 'year' column\n", "df_ucpay.year.unique()" @@ -159,9 +513,27 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:28:55.133262Z", + "start_time": "2019-06-30T20:28:55.077549Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['BERKELEY', 'IRVINE', 'SAN FRANCISCO', 'LOS ANGELES', 'DANR',\n", + " 'SANTA BARBARA', 'SANTA CRUZ', 'RIVERSIDE', 'DAVIS', 'MERCED',\n", + " 'SAN DIEGO', 'UCOP'], dtype=object)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get all unique entries in the 'campus' column\n", "df_ucpay.campus.unique()" @@ -176,19 +548,56 @@ }, { "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:28:59.381824Z", + "start_time": "2019-06-30T20:28:58.797922Z" + } + }, + "outputs": [ + { + "ename": "AttributeError", + "evalue": "'Series' object has no attribute 'sorted_values'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Look at number of entries by campus in the dataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf_ucpay\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'campus'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msorted_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'barh'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/.pyenv/versions/3.5.2/envs/python_3/lib/python3.5/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 4374\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4375\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4376\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4377\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4378\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 'sorted_values'" + ] + } + ], "source": [ "# Look at number of entries by campus in the dataset\n", - "df_ucpay.groupby('campus').size().plot(kind='barh')" + "df_ucpay.groupby('campus').size().sort_values(ascending=False).plot(kind='barh')" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:28:59.522668Z", + "start_time": "2019-06-30T20:28:59.452969Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['TUTOR - NON-GSHIP', 'STAFF RESEARCH ASSOC III',\n", + " 'READER - NON-GSHIP', ..., 'ATHLETICS MANAGER 4',\n", + " 'CHIEF EXEC OFFICER - MED CENTR', 'TREASURER OF THE REGENTS'],\n", + " dtype=object)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get all unique titles\n", "df_ucpay.title.unique()" @@ -196,9 +605,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:00.564528Z", + "start_time":
"2019-06-30T20:29:00.485584Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "2626" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Find out how many unique titles are present in the data\n", "len(df_ucpay.title.unique())" @@ -213,9 +638,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:03.109872Z", + "start_time": "2019-06-30T20:29:03.099584Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(259043, 10)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Get number of rows and columns of UC dataset\n", "df_ucpay.shape" @@ -232,9 +673,25 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:05.748041Z", + "start_time": "2019-06-30T20:29:05.626151Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(163429, 10)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Use a mask to keep only entries that do NOT have stars instead of a name\n", "mask = df_ucpay.name != \"***********\" \n", @@ -252,8 +709,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:08.142006Z", + "start_time": "2019-06-30T20:29:08.100261Z" + } + }, "outputs": [], "source": [ "# Save df_ucpay with only named entries\n", @@ -262,9 +724,288 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:08.923390Z", + "start_time": "2019-06-30T20:29:08.825038Z" + } + }, + "outputs": [ + { + "data": 
{ + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDyearcampusnametitlegrossbaseovertimeextraexclude
319668462011SAN FRANCISCOMACKEWICZ , CARL ESTAFF RESEARCH ASSOC III0.620.000.000
618703902011LOS ANGELESESCUJURI , ERIC JOSEPHTECHNICIAN, SCENE, SR1.140.000.000
717719362011BERKELEYJUNG , WOO YONGPOSTDOC-EMPLOYEE1.281.280.000
818921222011LOS ANGELESSHAPIRO , JORDAN ISAACTECHNICIAN, SCENE1.330.000.000
1219883592011SANTA BARBARACUTLER , CHARLES IANLABORATORY ASST I1.550.000.000
1518618692011LOS ANGELESANDERSON , MARK CALVINTECHNICIAN, SCENE, SR1.830.000.000
2518871912011LOS ANGELESPATEL , DEV KAPILASSISTANT IV2.192.190.000
2618754262011LOS ANGELESHILDER , JAMIE LPOSTDOC-EMPLOYEE2.202.200.000
4619772092011SAN FRANCISCOWU , YALEIPOSTDOC-EMPLOYEE2.842.840.000
7318958652011LOS ANGELESVALERIO , STEVEN GERARD,JRTECHNICIAN, SCENE4.240.000.000
7419546032011SAN FRANCISCOBAILEY , GREGORY D____ASSISTANT, HOSPITAL, II4.240.000.000
8119308962011SAN DIEGOCRESPO , NOE CUAUHTEMOCPOSTDOC-EMPLOYEE4.494.490.000
8318129402011DAVISRENO , BROOKEASSISTANT III4.634.630.000
9119449912011SAN DIEGORUBIO DE LA TOR , ELENAPOSTDOC-EMPLOYEE5.195.190.000
9418153382011DAVISSMITH , KEN APROGRAMMER VII - SUPV5.290.000.000
\n", + "
" + ], + "text/plain": [ + " ID year campus name \\\n", + "3 1966846 2011 SAN FRANCISCO MACKEWICZ , CARL E \n", + "6 1870390 2011 LOS ANGELES ESCUJURI , ERIC JOSEPH \n", + "7 1771936 2011 BERKELEY JUNG , WOO YONG \n", + "8 1892122 2011 LOS ANGELES SHAPIRO , JORDAN ISAAC \n", + "12 1988359 2011 SANTA BARBARA CUTLER , CHARLES IAN \n", + "15 1861869 2011 LOS ANGELES ANDERSON , MARK CALVIN \n", + "25 1887191 2011 LOS ANGELES PATEL , DEV KAPIL \n", + "26 1875426 2011 LOS ANGELES HILDER , JAMIE L \n", + "46 1977209 2011 SAN FRANCISCO WU , YALEI \n", + "73 1895865 2011 LOS ANGELES VALERIO , STEVEN GERARD,JR \n", + "74 1954603 2011 SAN FRANCISCO BAILEY , GREGORY D \n", + "81 1930896 2011 SAN DIEGO CRESPO , NOE CUAUHTEMOC \n", + "83 1812940 2011 DAVIS RENO , BROOKE \n", + "91 1944991 2011 SAN DIEGO RUBIO DE LA TOR , ELENA \n", + "94 1815338 2011 DAVIS SMITH , KEN A \n", + "\n", + " title gross base overtime extra exclude \n", + "3 STAFF RESEARCH ASSOC III 0.62 0.00 0.0 0 0 \n", + "6 TECHNICIAN, SCENE, SR 1.14 0.00 0.0 0 0 \n", + "7 POSTDOC-EMPLOYEE 1.28 1.28 0.0 0 0 \n", + "8 TECHNICIAN, SCENE 1.33 0.00 0.0 0 0 \n", + "12 LABORATORY ASST I 1.55 0.00 0.0 0 0 \n", + "15 TECHNICIAN, SCENE, SR 1.83 0.00 0.0 0 0 \n", + "25 ASSISTANT IV 2.19 2.19 0.0 0 0 \n", + "26 POSTDOC-EMPLOYEE 2.20 2.20 0.0 0 0 \n", + "46 POSTDOC-EMPLOYEE 2.84 2.84 0.0 0 0 \n", + "73 TECHNICIAN, SCENE 4.24 0.00 0.0 0 0 \n", + "74 ____ASSISTANT, HOSPITAL, II 4.24 0.00 0.0 0 0 \n", + "81 POSTDOC-EMPLOYEE 4.49 4.49 0.0 0 0 \n", + "83 ASSISTANT III 4.63 4.63 0.0 0 0 \n", + "91 POSTDOC-EMPLOYEE 5.19 5.19 0.0 0 0 \n", + "94 PROGRAMMER VII - SUPV 5.29 0.00 0.0 0 0 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Look at the first 15 entries in the updated dataset with redacted names removed\n", "df_ucpay.head(15)" @@ -281,8 +1022,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 15, + "metadata": { + 
"ExecuteTime": { + "end_time": "2019-06-30T20:29:12.209908Z", + "start_time": "2019-06-30T20:29:12.183091Z" + } + }, "outputs": [], "source": [ "# List of columns to keep \n", @@ -294,9 +1040,95 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 16, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:12.859866Z", + "start_time": "2019-06-30T20:29:12.818769Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDcampusnametitle
31966846SAN FRANCISCOMACKEWICZ , CARL ESTAFF RESEARCH ASSOC III
61870390LOS ANGELESESCUJURI , ERIC JOSEPHTECHNICIAN, SCENE, SR
71771936BERKELEYJUNG , WOO YONGPOSTDOC-EMPLOYEE
81892122LOS ANGELESSHAPIRO , JORDAN ISAACTECHNICIAN, SCENE
121988359SANTA BARBARACUTLER , CHARLES IANLABORATORY ASST I
\n", + "
" + ], + "text/plain": [ + " ID campus name title\n", + "3 1966846 SAN FRANCISCO MACKEWICZ , CARL E STAFF RESEARCH ASSOC III\n", + "6 1870390 LOS ANGELES ESCUJURI , ERIC JOSEPH TECHNICIAN, SCENE, SR\n", + "7 1771936 BERKELEY JUNG , WOO YONG POSTDOC-EMPLOYEE\n", + "8 1892122 LOS ANGELES SHAPIRO , JORDAN ISAAC TECHNICIAN, SCENE\n", + "12 1988359 SANTA BARBARA CUTLER , CHARLES IAN LABORATORY ASST I" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Look at the updated dataframe\n", "df_ucpay.head()" @@ -311,8 +1143,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:15.702750Z", + "start_time": "2019-06-30T20:29:15.659591Z" + } + }, "outputs": [], "source": [ "state_mask = df_nsf_awards['StateCode'] == 'CA'\n", @@ -321,9 +1158,195 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:16.304034Z", + "start_time": "2019-06-30T20:29:16.236810Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AwardIdFirstNameLastNameStartDateEndDateAwardTitleAwardEffectiveDateAwardExpirationDateNameCityNameZipCodePhoneNumberStreetAddressCountryNameStateNameStateCode
17805989AlbertSchwarz2010-08-26NaNApplication of methods of arithmetic geometry ...2010-09-012014-08-31University of California-DavisDavis95618-00005.307548e+09OR/Sponsored ProgramsUnited StatesCaliforniaCA
40820047FransTax2011-03-28NaNArabidopsis 2010: Global Analysis of Translati...2009-03-012013-02-28University of California-RiversideRIVERSIDE92521-10009.518276e+09Office of ResearchUnited StatesCaliforniaCA
49825254PaulDavis2012-01-27NaNCollaborative Research: A 3D Seismic Study of ...2010-01-012013-09-30University of California-Los AngelesLOS ANGELES90095-20003.107940e+0911000 Kinross Avenue, Suite 211United StatesCaliforniaCA
53830228NicholasMelosh2011-08-06NaNNSEC: CENTER FOR PROBING THE NANOSCALE2009-09-012014-08-31Stanford UniversityPalo Alto94304-12126.507232e+093160 Porter DriveUnited StatesCaliforniaCA
58831132JosephPasquale2010-04-12NaNCollaborative Research; CT-M: Computer Systems...2009-03-012012-12-31University of California-San DiegoLa Jolla92093-09348.585345e+09Office of Contract & Grant AdminUnited StatesCaliforniaCA
\n", + "
" + ], + "text/plain": [ + " AwardId FirstName LastName StartDate EndDate \\\n", + "17 805989 Albert Schwarz 2010-08-26 NaN \n", + "40 820047 Frans Tax 2011-03-28 NaN \n", + "49 825254 Paul Davis 2012-01-27 NaN \n", + "53 830228 Nicholas Melosh 2011-08-06 NaN \n", + "58 831132 Joseph Pasquale 2010-04-12 NaN \n", + "\n", + " AwardTitle AwardEffectiveDate \\\n", + "17 Application of methods of arithmetic geometry ... 2010-09-01 \n", + "40 Arabidopsis 2010: Global Analysis of Translati... 2009-03-01 \n", + "49 Collaborative Research: A 3D Seismic Study of ... 2010-01-01 \n", + "53 NSEC: CENTER FOR PROBING THE NANOSCALE 2009-09-01 \n", + "58 Collaborative Research; CT-M: Computer Systems... 2009-03-01 \n", + "\n", + " AwardExpirationDate Name CityName \\\n", + "17 2014-08-31 University of California-Davis Davis \n", + "40 2013-02-28 University of California-Riverside RIVERSIDE \n", + "49 2013-09-30 University of California-Los Angeles LOS ANGELES \n", + "53 2014-08-31 Stanford University Palo Alto \n", + "58 2012-12-31 University of California-San Diego La Jolla \n", + "\n", + " ZipCode PhoneNumber StreetAddress CountryName \\\n", + "17 95618-0000 5.307548e+09 OR/Sponsored Programs United States \n", + "40 92521-1000 9.518276e+09 Office of Research United States \n", + "49 90095-2000 3.107940e+09 11000 Kinross Avenue, Suite 211 United States \n", + "53 94304-1212 6.507232e+09 3160 Porter Drive United States \n", + "58 92093-0934 8.585345e+09 Office of Contract & Grant Admin United States \n", + "\n", + " StateName StateCode \n", + "17 California CA \n", + "40 California CA \n", + "49 California CA \n", + "53 California CA \n", + "58 California CA " + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_nsf_awards.head()" ] @@ -355,9 +1378,23 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": 
"2019-06-30T20:29:19.359643Z", + "start_time": "2019-06-30T20:29:19.349544Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['MACKEWICZ , CARL E' 'ESCUJURI , ERIC JOSEPH' 'JUNG , WOO YONG' ...\n", + " 'BUSUTTIL , RONALD W' 'HOWLAND , BENJAMIN CLARK' 'TEDFORD , JEFF']\n" + ] + } + ], "source": [ "# Get all of the values in the \"name\" column in the df_ucpay dataframe \n", "names_ucpay = df_ucpay.name.values\n", @@ -373,9 +1410,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:20.633341Z", + "start_time": "2019-06-30T20:29:20.624167Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Albert' 'Frans' 'Paul' ... 'Wenguang' 'Laurent' 'Benjamin']\n" + ] + } + ], "source": [ "# Get all of the values in the \"FirstName\" column in the df_nsf_awards dataframe\n", "firstnames_nsf_awards = df_nsf_awards.FirstName.values\n", @@ -384,9 +1434,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:21.450620Z", + "start_time": "2019-06-30T20:29:21.436951Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Schwarz' 'Tax' 'Davis' ... 
'Sun' 'El Ghaoui' 'Lev']\n" + ] + } + ], "source": [ "# Get all of the values in the \"LastName\" column in the df_nsf_awards dataframe\n", "lastnames_nsf_awards = df_nsf_awards.LastName.values\n", @@ -404,9 +1467,22 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:24.330673Z", + "start_time": "2019-06-30T20:29:24.314937Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['mackewicz ', ' carl e']\n" + ] + } + ], "source": [ "# Take the first name from the UC dataset\n", "test_name = names_ucpay[0]\n", @@ -429,8 +1505,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 23, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:27.485413Z", + "start_time": "2019-06-30T20:29:27.428516Z" + } + }, "outputs": [], "source": [ "def split_names(name):\n", @@ -480,8 +1561,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:31.060418Z", + "start_time": "2019-06-30T20:29:29.917927Z" + } + }, "outputs": [], "source": [ "# Apply our function to all the names in the UC dataset\n", @@ -497,8 +1583,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:32.443955Z", + "start_time": "2019-06-30T20:29:32.159706Z" + } + }, "outputs": [], "source": [ "ls_first, ls_middle, ls_last = zip(*ls_cleaned_names)" @@ -506,8 +1597,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 26, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:33.114011Z", + "start_time": "2019-06-30T20:29:32.727934Z" + } + }, "outputs": [], "source": [ "# Put colums in the UC dataset for first, middle, 
and last name\n", @@ -518,11 +1614,121 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:34.070913Z", + "start_time": "2019-06-30T20:29:34.021451Z" + }, "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDcampusnametitlefirstmiddlelast
31966846SAN FRANCISCOMACKEWICZ , CARL ESTAFF RESEARCH ASSOC IIIcarlemackewicz
61870390LOS ANGELESESCUJURI , ERIC JOSEPHTECHNICIAN, SCENE, SRericjosephescujuri
71771936BERKELEYJUNG , WOO YONGPOSTDOC-EMPLOYEEwooyongjung
81892122LOS ANGELESSHAPIRO , JORDAN ISAACTECHNICIAN, SCENEjordanisaacshapiro
121988359SANTA BARBARACUTLER , CHARLES IANLABORATORY ASST Icharlesiancutler
\n", + "
" + ], + "text/plain": [ + " ID campus name title \\\n", + "3 1966846 SAN FRANCISCO MACKEWICZ , CARL E STAFF RESEARCH ASSOC III \n", + "6 1870390 LOS ANGELES ESCUJURI , ERIC JOSEPH TECHNICIAN, SCENE, SR \n", + "7 1771936 BERKELEY JUNG , WOO YONG POSTDOC-EMPLOYEE \n", + "8 1892122 LOS ANGELES SHAPIRO , JORDAN ISAAC TECHNICIAN, SCENE \n", + "12 1988359 SANTA BARBARA CUTLER , CHARLES IAN LABORATORY ASST I \n", + "\n", + " first middle last \n", + "3 carl e mackewicz \n", + "6 eric joseph escujuri \n", + "7 woo yong jung \n", + "8 jordan isaac shapiro \n", + "12 charles ian cutler " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_ucpay.head()" ] @@ -538,8 +1744,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 28, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:36.893652Z", + "start_time": "2019-06-30T20:29:36.791626Z" + } + }, "outputs": [], "source": [ "df_nsf_awards.dropna(subset=['FirstName','LastName'], inplace=True)" @@ -554,8 +1765,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:37.959183Z", + "start_time": "2019-06-30T20:29:37.930545Z" + } + }, "outputs": [], "source": [ "df_nsf_awards['first'] = [unicode(name.lower()) for name in df_nsf_awards['FirstName'].values]\n", @@ -573,11 +1789,208 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:29:41.392713Z", + "start_time": "2019-06-30T20:29:41.320103Z" + }, "scrolled": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AwardIdFirstNameLastNameStartDateEndDateAwardTitleAwardEffectiveDateAwardExpirationDateNameCityNameZipCodePhoneNumberStreetAddressCountryNameStateNameStateCodefirstlast
17805989AlbertSchwarz2010-08-26NaNApplication of methods of arithmetic geometry ...2010-09-012014-08-31University of California-DavisDavis95618-00005.307548e+09OR/Sponsored ProgramsUnited StatesCaliforniaCAalbertschwarz
40820047FransTax2011-03-28NaNArabidopsis 2010: Global Analysis of Translati...2009-03-012013-02-28University of California-RiversideRIVERSIDE92521-10009.518276e+09Office of ResearchUnited StatesCaliforniaCAfranstax
49825254PaulDavis2012-01-27NaNCollaborative Research: A 3D Seismic Study of ...2010-01-012013-09-30University of California-Los AngelesLOS ANGELES90095-20003.107940e+0911000 Kinross Avenue, Suite 211United StatesCaliforniaCApauldavis
53830228NicholasMelosh2011-08-06NaNNSEC: CENTER FOR PROBING THE NANOSCALE2009-09-012014-08-31Stanford UniversityPalo Alto94304-12126.507232e+093160 Porter DriveUnited StatesCaliforniaCAnicholasmelosh
58831132JosephPasquale2010-04-12NaNCollaborative Research; CT-M: Computer Systems...2009-03-012012-12-31University of California-San DiegoLa Jolla92093-09348.585345e+09Office of Contract & Grant AdminUnited StatesCaliforniaCAjosephpasquale
\n", + "
" + ], + "text/plain": [ + " AwardId FirstName LastName StartDate EndDate \\\n", + "17 805989 Albert Schwarz 2010-08-26 NaN \n", + "40 820047 Frans Tax 2011-03-28 NaN \n", + "49 825254 Paul Davis 2012-01-27 NaN \n", + "53 830228 Nicholas Melosh 2011-08-06 NaN \n", + "58 831132 Joseph Pasquale 2010-04-12 NaN \n", + "\n", + " AwardTitle AwardEffectiveDate \\\n", + "17 Application of methods of arithmetic geometry ... 2010-09-01 \n", + "40 Arabidopsis 2010: Global Analysis of Translati... 2009-03-01 \n", + "49 Collaborative Research: A 3D Seismic Study of ... 2010-01-01 \n", + "53 NSEC: CENTER FOR PROBING THE NANOSCALE 2009-09-01 \n", + "58 Collaborative Research; CT-M: Computer Systems... 2009-03-01 \n", + "\n", + " AwardExpirationDate Name CityName \\\n", + "17 2014-08-31 University of California-Davis Davis \n", + "40 2013-02-28 University of California-Riverside RIVERSIDE \n", + "49 2013-09-30 University of California-Los Angeles LOS ANGELES \n", + "53 2014-08-31 Stanford University Palo Alto \n", + "58 2012-12-31 University of California-San Diego La Jolla \n", + "\n", + " ZipCode PhoneNumber StreetAddress CountryName \\\n", + "17 95618-0000 5.307548e+09 OR/Sponsored Programs United States \n", + "40 92521-1000 9.518276e+09 Office of Research United States \n", + "49 90095-2000 3.107940e+09 11000 Kinross Avenue, Suite 211 United States \n", + "53 94304-1212 6.507232e+09 3160 Porter Drive United States \n", + "58 92093-0934 8.585345e+09 Office of Contract & Grant Admin United States \n", + "\n", + " StateName StateCode first last \n", + "17 California CA albert schwarz \n", + "40 California CA frans tax \n", + "49 California CA paul davis \n", + "53 California CA nicholas melosh \n", + "58 California CA joseph pasquale " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "df_nsf_awards.head()" ] @@ -597,8 +2010,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 
38, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:31:55.297567Z", + "start_time": "2019-06-30T20:31:55.230080Z" + } + }, "outputs": [], "source": [ "class StringComparators():\n", @@ -628,37 +2046,68 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 39, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:31:56.414063Z", + "start_time": "2019-06-30T20:31:56.402161Z" + } + }, "outputs": [], "source": [ "# Get all of the unique names from NSF and UC \n", - "nsf_firstnames = set( df_nsf_awards['first'].values ) \n", + "nsf_firstnames = set(df_nsf_awards['first'].values) \n", "\n", "# grab the uc_names\n", - "uc_firstnames = df_ucpay['first'].values " + "uc_firstnames = df_ucpay['first'].values" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 40, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:31:56.831857Z", + "start_time": "2019-06-30T20:31:56.819260Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'carl'" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Comparison of records\n", - "testname = unicode(uc_firstnames[0])" + "testname = unicode(uc_firstnames[0])\n", + "testname" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 48, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:36:01.976639Z", + "start_time": "2019-06-30T20:36:01.934623Z" + } + }, "outputs": [], "source": [ - "# we should document this better and uc_names an argument\n", - "def get_matching_first_name(testname, NUM_NAMES=10):\n", + "def get_matching_first_name(testname, uc_firstnames, NUM_NAMES=10):\n", " \"\"\"\n", - " get top 10 first names that match\n", + " Get top 10 first names from UC that matches with the testname\n", + " \n", + " :param testname: string to test\n", + " :param uc_firstnames: list of names from 
UC\n", + " :param NUM_NAMES: nth most similar matches\n", + " :return: list with nth most similar matches orderd by similarity\n", " \"\"\"\n", " dict_name_pair = {}\n", " for name in uc_firstnames:\n", @@ -677,22 +2126,72 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 42, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:32:31.795435Z", + "start_time": "2019-06-30T20:32:31.349966Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "carl ['carl', 'carlo', 'carli', 'carly', 'carla', 'carol', 'caryl', 'carlos', 'carlie', 'carlee']\n" + ] + } + ], "source": [ - "print(testname, get_matching_first_name(testname))" + "print(testname, get_matching_first_name(testname, uc_firstnames))" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for nm in uc_firstnames[:25]:\n", + "execution_count": 43, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:32:43.569250Z", + "start_time": "2019-06-30T20:32:32.044145Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 carl ['carl', 'carlo', 'carli', 'carly', 'carla', 'carol', 'caryl', 'carlos', 'carlie', 'carlee']\n", + "2 eric ['eric', 'erick', 'erice', 'erich', 'erica', 'erric', 'edric', 'enric', 'ericka', 'ericha']\n", + "3 woo ['woo', 'wood', 'woon', 'wook', 'wo', 'woody', 'woori', 'elwood', 'witoon', 'woojong']\n", + "4 jordan ['jordan', 'jordana', 'jordane', 'jourdan', 'jordin', 'jordon', 'jordaniel', 'jodean', 'joan', 'joraine']\n", + "5 charles ['charles', 'charless', 'charley', 'charese', 'charlee', 'charlie', 'charleston', 'charline', 'charlane', 'charleen']\n", + "6 mark ['mark', 'marko', 'marka', 'marek', 'markos', 'markee', 'markle', 'markim', 'markus', 'marika']\n", + "7 dev ['dev', 'devi', 'deva', 'de', 'devra', 'devie', 'devyn', 'devan', 'devin', 'devon']\n", + "8 jamie ['jamie', 'jamiel', 
'jammie', 'jami', 'jaymie', 'jaime', 'jamaine', 'amie', 'jasmine', 'jazmine']\n", + "9 yalei ['yalei', 'yale', 'yanlei', 'yali', 'yaneli', 'yawei', 'yafei', 'yanli', 'yael', 'maylei']\n", + "10 steven ['steven', 'steve', 'steaven', 'stevie', 'stevon', 'stevan', 'sten', 'steven-huy', 'seveen', 'seve']\n", + "11 gregory ['gregory', 'gregor', 'grigory', 'gregorij', 'gregorio', 'gregoria', 'gregoire', 'greg', 'grzegorz', 'grigoriy']\n", + "12 noe ['noe', 'noel', 'nomer', 'noemi', 'noree', 'noemy', 'noeun', 'nokteh', 'noelle', 'noriel']\n", + "13 brooke ['brooke', 'brookie', 'brook', 'brooks', 'booker', 'brooklyn', 'brooklynn', 'brock', 'burke', 'broc']\n", + "14 elena ['elena', 'elenna', 'elen', 'ellena', 'elna', 'helena', 'jelena', 'celena', 'yelena', 'selena']\n", + "15 ken ['ken', 'keen', 'kien', 'kean', 'koen', 'kent', 'keni', 'keun', 'keon', 'ke']\n", + "16 kevin ['kevin', 'kelvin', 'evin', 'kevis', 'keven', 'kevan', 'r.kevin', 'kevin-paul', 'keqin', 'kexin']\n", + "17 victoria ['victoria', 'victorina', 'victori', 'vicktoria', 'victoriana', 'victoriano', 'victorio', 'victor', 'victorya', 'viktoria']\n", + "18 sarah ['sarah', 'sarath', 'sarahy', 'sara', 'sarahann', 'samrrah', 'saurabh', 'safirah', 'saras', 'sarai']\n", + "19 kaitlin ['kaitlin', 'kailin', 'kaitlyn', 'kaili', 'katalin', 'katelin', 'kaetlin', 'katlin', 'caitlin', 'kaiqin']\n", + "20 sabrina ['sabrina', 'sabrin', 'sabina', 'sarina', 'sabra', 'sabin', 'sarbani', 'sarna', 'sharina', 'sebrina']\n", + "21 michele ['michele', 'michelle', 'michaele', 'michel', \"miche-le'\", 'micheline', 'micheal', 'michela', 'michiel', 'michael']\n", + "22 fernando ['fernando', 'fernand', 'fernan', 'fernanda', 'hernando', 'fern', 'fenno', 'ferdinando', 'ferdnand', 'ferda']\n", + "23 michelle ['michelle', 'michele', 'michell', 'mi-chelle', 'mitchelle', 'mischelle', 'michellene', 'michel', 'michaele', 'marichelle']\n", + "24 adi ['adi', 'ardi', 'adib', 'hadi', 'fadi', 'gadi', 'adil', 'adia', 'fadil', 'fadia']\n", + "25 cindy 
['cindy', 'cinda', 'cindi', 'cindra', 'cindee', 'cindy-heung', 'cyndy', 'candy', 'cendy', 'lindy']\n" + ] + } + ], + "source": [ + "for i, nm in enumerate(uc_firstnames[:25]):\n", " testname = unicode(nm)\n", - " print(testname, get_matching_first_name(testname))" + " print(i+1, testname, get_matching_first_name(testname, uc_firstnames))" ] }, { @@ -714,8 +2213,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 44, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:32:48.377777Z", + "start_time": "2019-06-30T20:32:48.345407Z" + } + }, "outputs": [], "source": [ "dict_nsf_awards = df_nsf_awards[:10].to_dict(orient='index')" @@ -723,8 +2227,217 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 45, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:32:49.052801Z", + "start_time": "2019-06-30T20:32:49.009830Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{17: {'AwardEffectiveDate': '2010-09-01',\n", + " 'AwardExpirationDate': '2014-08-31',\n", + " 'AwardId': 805989,\n", + " 'AwardTitle': 'Application of methods of arithmetic geometry and homological algebra to quantum field theory and string theory',\n", + " 'CityName': 'Davis',\n", + " 'CountryName': 'United States',\n", + " 'EndDate': nan,\n", + " 'FirstName': 'Albert',\n", + " 'LastName': 'Schwarz',\n", + " 'Name': 'University of California-Davis',\n", + " 'PhoneNumber': 5307547700.0,\n", + " 'StartDate': '2010-08-26',\n", + " 'StateCode': 'CA',\n", + " 'StateName': 'California',\n", + " 'StreetAddress': 'OR/Sponsored Programs',\n", + " 'ZipCode': '95618-0000',\n", + " 'first': 'albert',\n", + " 'last': 'schwarz'},\n", + " 40: {'AwardEffectiveDate': '2009-03-01',\n", + " 'AwardExpirationDate': '2013-02-28',\n", + " 'AwardId': 820047,\n", + " 'AwardTitle': 'Arabidopsis 2010: Global Analysis of Translational Regulons',\n", + " 'CityName': 'RIVERSIDE',\n", + " 'CountryName': 'United 
States',\n", + " 'EndDate': nan,\n", + " 'FirstName': 'Frans',\n", + " 'LastName': 'Tax',\n", + " 'Name': 'University of California-Riverside',\n", + " 'PhoneNumber': 9518275535.0,\n", + " 'StartDate': '2011-03-28',\n", + " 'StateCode': 'CA',\n", + " 'StateName': 'California',\n", + " 'StreetAddress': 'Office of Research',\n", + " 'ZipCode': '92521-1000',\n", + " 'first': 'frans',\n", + " 'last': 'tax'},\n", + " 49: {'AwardEffectiveDate': '2010-01-01',\n", + " 'AwardExpirationDate': '2013-09-30',\n", + " 'AwardId': 825254,\n", + " 'AwardTitle': 'Collaborative Research: A 3D Seismic Study of the Pacific-North American Plate Boundary in Southern California',\n", + " 'CityName': 'LOS ANGELES',\n", + " 'CountryName': 'United States',\n", + " 'EndDate': nan,\n", + " 'FirstName': 'Paul',\n", + " 'LastName': 'Davis',\n", + " 'Name': 'University of California-Los Angeles',\n", + " 'PhoneNumber': 3107940102.0,\n", + " 'StartDate': '2012-01-27',\n", + " 'StateCode': 'CA',\n", + " 'StateName': 'California',\n", + " 'StreetAddress': '11000 Kinross Avenue, Suite 211',\n", + " 'ZipCode': '90095-2000',\n", + " 'first': 'paul',\n", + " 'last': 'davis'},\n", + " 53: {'AwardEffectiveDate': '2009-09-01',\n", + " 'AwardExpirationDate': '2014-08-31',\n", + " 'AwardId': 830228,\n", + " 'AwardTitle': 'NSEC: CENTER FOR PROBING THE NANOSCALE',\n", + " 'CityName': 'Palo Alto',\n", + " 'CountryName': 'United States',\n", + " 'EndDate': nan,\n", + " 'FirstName': 'Nicholas',\n", + " 'LastName': 'Melosh',\n", + " 'Name': 'Stanford University',\n", + " 'PhoneNumber': 6507232300.0,\n", + " 'StartDate': '2011-08-06',\n", + " 'StateCode': 'CA',\n", + " 'StateName': 'California',\n", + " 'StreetAddress': '3160 Porter Drive',\n", + " 'ZipCode': '94304-1212',\n", + " 'first': 'nicholas',\n", + " 'last': 'melosh'},\n", + " 58: {'AwardEffectiveDate': '2009-03-01',\n", + " 'AwardExpirationDate': '2012-12-31',\n", + " 'AwardId': 831132,\n", + " 'AwardTitle': 'Collaborative Research; CT-M: Computer Systems 
Vulnerabilities and the Efficacy of Defensive Mechanisms',\n", + " 'CityName': 'La Jolla',\n", + " 'CountryName': 'United States',\n", + " 'EndDate': nan,\n", + " 'FirstName': 'Joseph',\n", + " 'LastName': 'Pasquale',\n", + " 'Name': 'University of California-San Diego',\n", + " 'PhoneNumber': 8585344896.0,\n", + " 'StartDate': '2010-04-12',\n", + " 'StateCode': 'CA',\n", + " 'StateName': 'California',\n", + " 'StreetAddress': 'Office of Contract & Grant Admin',\n", + " 'ZipCode': '92093-0934',\n", + " 'first': 'joseph',\n", + " 'last': 'pasquale'},\n", + " 69: {'AwardEffectiveDate': '2009-09-01',\n", + " 'AwardExpirationDate': '2014-08-31',\n", + " 'AwardId': 832819,\n", + " 'AwardTitle': 'The Center of Integrated Nanomechanical Systems (COINS) Renewal Yrs 6-10',\n", + " 'CityName': 'BERKELEY',\n", + " 'CountryName': 'United States',\n", + " 'EndDate': nan,\n", + " 'FirstName': 'Ronald',\n", + " 'LastName': 'Fearing',\n", + " 'Name': 'University of California-Berkeley',\n", + " 'PhoneNumber': 5106428109.0,\n", + " 'StartDate': '2010-02-25',\n", + " 'StateCode': 'CA',\n", + " 'StateName': 'California',\n", + " 'StreetAddress': 'Sponsored Projects Office',\n", + " 'ZipCode': '94704-5940',\n", + " 'first': 'ronald',\n", + " 'last': 'fearing'},\n", + " 73: {'AwardEffectiveDate': '2009-01-01',\n", + " 'AwardExpirationDate': '2014-12-31',\n", + " 'AwardId': 833340,\n", + " 'AwardTitle': 'Robert Noyce Teacher Scholars Program',\n", + " 'CityName': 'SANTA CRUZ',\n", + " 'CountryName': 'United States',\n", + " 'EndDate': nan,\n", + " 'FirstName': 'Gretchen',\n", + " 'LastName': 'Andreasen',\n", + " 'Name': 'University of California-Santa Cruz',\n", + " 'PhoneNumber': 8314595278.0,\n", + " 'StartDate': '2011-07-19',\n", + " 'StateCode': 'CA',\n", + " 'StateName': 'California',\n", + " 'StreetAddress': '1156 High Street',\n", + " 'ZipCode': '95064-1077',\n", + " 'first': 'gretchen',\n", + " 'last': 'andreasen'},\n", + " 103: {'AwardEffectiveDate': '2009-01-01',\n", + " 
'AwardExpirationDate': '2014-09-30',\n", + " 'AwardId': 836152,\n", + " 'AwardTitle': 'The Sondrestrom Upper Atmospheric Research Facility: Research, Operation and Coordination',\n", + " 'CityName': 'MENLO PARK',\n", + " 'CountryName': 'United States',\n", + " 'EndDate': '2013-05-30',\n", + " 'FirstName': 'Craig',\n", + " 'LastName': 'Heinselman',\n", + " 'Name': 'SRI International',\n", + " 'PhoneNumber': 6508592651.0,\n", + " 'StartDate': '2012-12-20',\n", + " 'StateCode': 'CA',\n", + " 'StateName': 'California',\n", + " 'StreetAddress': '333 RAVENSWOOD AVE',\n", + " 'ZipCode': '94025-3493',\n", + " 'first': 'craig',\n", + " 'last': 'heinselman'},\n", + " 104: {'AwardEffectiveDate': '2009-01-01',\n", + " 'AwardExpirationDate': '2014-09-30',\n", + " 'AwardId': 836152,\n", + " 'AwardTitle': 'The Sondrestrom Upper Atmospheric Research Facility: Research, Operation and Coordination',\n", + " 'CityName': 'MENLO PARK',\n", + " 'CountryName': 'United States',\n", + " 'EndDate': '2013-05-30',\n", + " 'FirstName': 'Mary',\n", + " 'LastName': 'McCready',\n", + " 'Name': 'SRI International',\n", + " 'PhoneNumber': 6508592651.0,\n", + " 'StartDate': '2012-02-10',\n", + " 'StateCode': 'CA',\n", + " 'StateName': 'California',\n", + " 'StreetAddress': '333 RAVENSWOOD AVE',\n", + " 'ZipCode': '94025-3493',\n", + " 'first': 'mary',\n", + " 'last': 'mccready'},\n", + " 131: {'AwardEffectiveDate': '2009-07-01',\n", + " 'AwardExpirationDate': '2013-06-30',\n", + " 'AwardId': 838258,\n", + " 'AwardTitle': 'Collaborative Research: Astronomy with CARMA',\n", + " 'CityName': 'BERKELEY',\n", + " 'CountryName': 'United States',\n", + " 'EndDate': nan,\n", + " 'FirstName': 'Leo',\n", + " 'LastName': 'Blitz',\n", + " 'Name': 'University of California-Berkeley',\n", + " 'PhoneNumber': 5106428109.0,\n", + " 'StartDate': '2011-03-07',\n", + " 'StateCode': 'CA',\n", + " 'StateName': 'California',\n", + " 'StreetAddress': 'Sponsored Projects Office',\n", + " 'ZipCode': '94704-5940',\n", + " 
'first': 'leo',\n", + " 'last': 'blitz'}}" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dict_nsf_awards" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:36:17.939991Z", + "start_time": "2019-06-30T20:36:17.894355Z" + } + }, "outputs": [], "source": [ "def create_rule_mask(nsf_first_name, \n", @@ -766,8 +2479,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 50, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:36:19.185824Z", + "start_time": "2019-06-30T20:36:19.094814Z" + } + }, "outputs": [], "source": [ "def match_records(dict_nsf_awards, df_ucpay, f_create_rule_mask):\n", @@ -802,30 +2520,279 @@ " df_matches = df_ucpay[jaro_mask]\n", " if len(df_matches) == 0:\n", " print('No Match: {} {}'.format(nsf_first_name,nsf_last_name))\n", - " for row in df_matches.iterrows():\n", - " dict_test_row['ID'] = row[1]['ID']\n", - " dict_test_row['campus'] = row[1]['campus']\n", - " dict_test_row['name'] = row[1]['name']\n", - " dict_test_row['title'] = row[1]['title']\n", - " df_linked_data = df_linked_data.append(dict_test_row, ignore_index=True)\n", + " else:\n", + " for row in df_matches.iterrows():\n", + " dict_test_row['ID'] = row[1]['ID']\n", + " dict_test_row['campus'] = row[1]['campus']\n", + " dict_test_row['name'] = row[1]['name']\n", + " dict_test_row['title'] = row[1]['title']\n", + " df_linked_data = df_linked_data.append(dict_test_row, ignore_index=True)\n", " \n", " return df_linked_data" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 51, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:36:28.469466Z", + "start_time": "2019-06-30T20:36:20.301998Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No Match: nicholas melosh\n", + "No Match: 
frans tax\n", + "No Match: craig heinselman\n", + "No Match: mary mccready\n" + ] + } + ], "source": [ "df_linked_data = match_records(dict_nsf_awards, df_ucpay, create_rule_mask )" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 52, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T20:36:28.556271Z", + "start_time": "2019-06-30T20:36:28.474144Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AwardIdCityNameFirstNameIDLastNameNamecampustitlefirstlast
0805989.0DavisAlbert1814365.0SchwarzUniversity of California-DavisDAVISPROFESSOR - ACADEMIC YEARalbertschwarz
1838258.0BERKELEYLeo1765982.0BlitzUniversity of California-BerkeleyBERKELEYPROFESSOR - ACADEMIC YEARleoblitz
2825254.0LOS ANGELESPaul1832316.0DavisUniversity of California-Los AngelesIRVINEANALYST, ADMINISTRATIVE, ASSTpauldavis
3825254.0LOS ANGELESPaul1832352.0DavisUniversity of California-Los AngelesIRVINEHOSPITAL UNIT SERV COORD IIpauldavis
4825254.0LOS ANGELESPaul1800777.0DavisUniversity of California-Los AngelesDAVISRESEARCH _____ - FISCAL YEARpauldavis
5825254.0LOS ANGELESPaul1868502.0DavisUniversity of California-Los AngelesLOS ANGELESPROGRAMMER/ANALYST IV - SUPERVpauldavis
6825254.0LOS ANGELESPaul1868593.0DavisUniversity of California-Los AngelesLOS ANGELESPROFESSOR - ACADEMIC YEARpauldavis
7833340.0SANTA CRUZGretchen1999814.0AndreasenUniversity of California-Santa CruzSANTA CRUZADMIN/COORD/OFFICER(FUNC AREA)gretchenandreasen
8831132.0La JollaJoseph1887173.0PasqualeUniversity of California-San DiegoLOS ANGELESREPRESENTATIVE, ACCESS, PRINjosephpasquale
9831132.0La JollaJoseph1942864.0PasqualeUniversity of California-San DiegoSAN DIEGOPROFESSOR-ACAD YR-BUS/ECON/ENGjosephpasquale
10832819.0BERKELEYRonald1768965.0FearingUniversity of California-BerkeleyBERKELEYPROFESSOR-ACAD YR-BUS/ECON/ENGronaldfearing
\n", + "
" + ], + "text/plain": [ + " AwardId CityName FirstName ID LastName \\\n", + "0 805989.0 Davis Albert 1814365.0 Schwarz \n", + "1 838258.0 BERKELEY Leo 1765982.0 Blitz \n", + "2 825254.0 LOS ANGELES Paul 1832316.0 Davis \n", + "3 825254.0 LOS ANGELES Paul 1832352.0 Davis \n", + "4 825254.0 LOS ANGELES Paul 1800777.0 Davis \n", + "5 825254.0 LOS ANGELES Paul 1868502.0 Davis \n", + "6 825254.0 LOS ANGELES Paul 1868593.0 Davis \n", + "7 833340.0 SANTA CRUZ Gretchen 1999814.0 Andreasen \n", + "8 831132.0 La Jolla Joseph 1887173.0 Pasquale \n", + "9 831132.0 La Jolla Joseph 1942864.0 Pasquale \n", + "10 832819.0 BERKELEY Ronald 1768965.0 Fearing \n", + "\n", + " Name campus \\\n", + "0 University of California-Davis DAVIS \n", + "1 University of California-Berkeley BERKELEY \n", + "2 University of California-Los Angeles IRVINE \n", + "3 University of California-Los Angeles IRVINE \n", + "4 University of California-Los Angeles DAVIS \n", + "5 University of California-Los Angeles LOS ANGELES \n", + "6 University of California-Los Angeles LOS ANGELES \n", + "7 University of California-Santa Cruz SANTA CRUZ \n", + "8 University of California-San Diego LOS ANGELES \n", + "9 University of California-San Diego SAN DIEGO \n", + "10 University of California-Berkeley BERKELEY \n", + "\n", + " title first last \n", + "0 PROFESSOR - ACADEMIC YEAR albert schwarz \n", + "1 PROFESSOR - ACADEMIC YEAR leo blitz \n", + "2 ANALYST, ADMINISTRATIVE, ASST paul davis \n", + "3 HOSPITAL UNIT SERV COORD II paul davis \n", + "4 RESEARCH _____ - FISCAL YEAR paul davis \n", + "5 PROGRAMMER/ANALYST IV - SUPERV paul davis \n", + "6 PROFESSOR - ACADEMIC YEAR paul davis \n", + "7 ADMIN/COORD/OFFICER(FUNC AREA) gretchen andreasen \n", + "8 REPRESENTATIVE, ACCESS, PRIN joseph pasquale \n", + "9 PROFESSOR-ACAD YR-BUS/ECON/ENG joseph pasquale \n", + "10 PROFESSOR-ACAD YR-BUS/ECON/ENG ronald fearing " + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ 
"selected_columns = ['AwardId', 'CityName', 'FirstName', 'ID', 'LastName', 'Name', 'campus', 'title', 'first', 'last']\n", "df_linked_data[selected_columns]" @@ -864,13 +2831,20 @@ "source": [ "The final way is matching records *probabilistically*. The **Fellegi-Sunter** model compares selected similiar fields in two records and calculates a similarity score, or a weighted probablity of the two records being the same entity. \n", "\n", - "The algorithm is the following: two fields are first compared using a metric, in this case, the Jaro-Winkler distance (which can be between 0 and 1). The Jaro-Winkler distance is binned into one of three categories: exact match, close match, or no match. Each category has an associated distribution, based on known matches and unmatches. The log probability of being a match and the log probability of being a non-match are calculated for each pair. The final score is the log probablity of being a match minus the log probablity of being a non-match. If the final score is greater than a threshold, then the records are considered to match." + "The algorithm is the following: two fields are first compared using a metric, in this case, the Jaro-Winkler distance (which can be between 0 and 1). The Jaro-Winkler distance is binned into one of three categories: exact match, close match, or no match. Each category has an associated distribution, based on known matches and non-matches. The log probability of being a match and the log probability of being a non-match are calculated for each pair. The final score is the log probability of being a match minus the log probability of being a non-match. If the final score is greater than a threshold, then the records are considered to match.\n", + "\n", + "A Python implementation of the Fellegi-Sunter model is already available in the **recordlinkage** package (`pip install recordlinkage`)."
] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 74, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:00:59.871185Z", + "start_time": "2019-06-30T21:00:59.526748Z" + } + }, "outputs": [], "source": [ "# We should probably just make these simple functions instead of objects\n", @@ -894,7 +2868,7 @@ " \n", " * exact match is a jaro-winkler score >= 0.92\n", " * close match is a jaro-winkler score > 0.85\n", - " * no match is a jaro-winkler score < 0.85\n", + " * no match is a jaro-winkler score <= 0.85\n", " \n", " Parameters\n", " ----------\n", @@ -946,9 +2920,9 @@ " #grab the m and u weights\n", " \n", " first_name_m_weight = self.m_weights['first_name'][first_name_score]\n", - " first_name_u_weight = self.u_weights['last_name'][first_name_score]\n", + " first_name_u_weight = self.u_weights['first_name'][first_name_score]\n", " \n", - " last_name_m_weight = self.m_weights['first_name'][last_name_score]\n", + " last_name_m_weight = self.m_weights['last_name'][last_name_score]\n", " last_name_u_weight = self.u_weights['last_name'][last_name_score]\n", " \n", " log_prob_match = math.log(first_name_m_weight) + math.log(last_name_m_weight)\n", @@ -987,8 +2961,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 75, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:01:00.290652Z", + "start_time": "2019-06-30T21:01:00.283463Z" + } + }, "outputs": [], "source": [ "fs = FellegiSunter()" @@ -996,27 +2975,68 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 76, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:01:00.790494Z", + "start_time": "2019-06-30T21:01:00.783007Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + } + ], "source": [ "print( fs.link_record(('Avishek','Kumar'), ('Avishek','Kumar')) )" ] }, { "cell_type": 
"code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 77, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:01:01.352939Z", + "start_time": "2019-06-30T21:01:01.342885Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "False\n" + ] + } + ], "source": [ "print( fs.link_record( ('Avishek','Kumar'), ('Anup','Kumar') ) )" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 78, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:01:04.770898Z", + "start_time": "2019-06-30T21:01:04.756602Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "jonathon jonthon 2\n", + "john mark 0\n", + "fred frederick 1\n" + ] + } + ], "source": [ "#let's take this new function for a spin\n", "print('jonathon', 'jonthon', fs.fuzzy_match('jonathon','jonthon') )\n", @@ -1026,8 +3046,13 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 79, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:06:27.846712Z", + "start_time": "2019-06-30T21:06:27.813922Z" + } + }, "outputs": [], "source": [ "def create_jaro_mask(nsf_first_name, nsf_last_name, df_ucpay):\n", @@ -1060,20 +3085,314 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 80, + "metadata": { + "ExecuteTime": { + "end_time": "2019-06-30T21:07:02.333067Z", + "start_time": "2019-06-30T21:06:40.122671Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No Match: nicholas melosh\n", + "No Match: frans tax\n", + "No Match: craig heinselman\n" + ] + } + ], "source": [ "df_linked_data = match_records(dict_nsf_awards, df_ucpay, create_jaro_mask )" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 81, "metadata": { + "ExecuteTime": { + "end_time": 
"2019-06-30T21:07:17.414495Z", + "start_time": "2019-06-30T21:07:17.351252Z" + }, "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AwardIdCityNameFirstNameIDLastNameNamecampustitlefirstlast
0805989.0DavisAlbert1814365.0SchwarzUniversity of California-DavisDAVISPROFESSOR - ACADEMIC YEARalbertschwarz
1838258.0BERKELEYLeo1765982.0BlitzUniversity of California-BerkeleyBERKELEYPROFESSOR - ACADEMIC YEARleoblitz
2825254.0LOS ANGELESPaul1832316.0DavisUniversity of California-Los AngelesIRVINEANALYST, ADMINISTRATIVE, ASSTpauldavis
3825254.0LOS ANGELESPaul1832352.0DavisUniversity of California-Los AngelesIRVINEHOSPITAL UNIT SERV COORD IIpauldavis
4825254.0LOS ANGELESPaul1800777.0DavisUniversity of California-Los AngelesDAVISRESEARCH _____ - FISCAL YEARpauldavis
5825254.0LOS ANGELESPaul1868502.0DavisUniversity of California-Los AngelesLOS ANGELESPROGRAMMER/ANALYST IV - SUPERVpauldavis
6825254.0LOS ANGELESPaul1868593.0DavisUniversity of California-Los AngelesLOS ANGELESPROFESSOR - ACADEMIC YEARpauldavis
7833340.0SANTA CRUZGretchen1999814.0AndreasenUniversity of California-Santa CruzSANTA CRUZADMIN/COORD/OFFICER(FUNC AREA)gretchenandreasen
8831132.0La JollaJoseph1887173.0PasqualeUniversity of California-San DiegoLOS ANGELESREPRESENTATIVE, ACCESS, PRINjosephpasquale
9831132.0La JollaJoseph1942864.0PasqualeUniversity of California-San DiegoSAN DIEGOPROFESSOR-ACAD YR-BUS/ECON/ENGjosephpasquale
10836152.0MENLO PARKMary1774629.0McCreadySRI InternationalBERKELEYACADEMIC COORD II-ACADEMIC YRmarymccready
11836152.0MENLO PARKMary1837591.0McCreadySRI InternationalIRVINESENIOR FINANCE ANALYSTmarymccready
12836152.0MENLO PARKMary1944081.0McCreadySRI InternationalSAN DIEGOMANAGER, CASEmarymccready
13832819.0BERKELEYRonald1768965.0FearingUniversity of California-BerkeleyBERKELEYPROFESSOR-ACAD YR-BUS/ECON/ENGronaldfearing
\n", + "
" + ], + "text/plain": [ + " AwardId CityName FirstName ID LastName \\\n", + "0 805989.0 Davis Albert 1814365.0 Schwarz \n", + "1 838258.0 BERKELEY Leo 1765982.0 Blitz \n", + "2 825254.0 LOS ANGELES Paul 1832316.0 Davis \n", + "3 825254.0 LOS ANGELES Paul 1832352.0 Davis \n", + "4 825254.0 LOS ANGELES Paul 1800777.0 Davis \n", + "5 825254.0 LOS ANGELES Paul 1868502.0 Davis \n", + "6 825254.0 LOS ANGELES Paul 1868593.0 Davis \n", + "7 833340.0 SANTA CRUZ Gretchen 1999814.0 Andreasen \n", + "8 831132.0 La Jolla Joseph 1887173.0 Pasquale \n", + "9 831132.0 La Jolla Joseph 1942864.0 Pasquale \n", + "10 836152.0 MENLO PARK Mary 1774629.0 McCready \n", + "11 836152.0 MENLO PARK Mary 1837591.0 McCready \n", + "12 836152.0 MENLO PARK Mary 1944081.0 McCready \n", + "13 832819.0 BERKELEY Ronald 1768965.0 Fearing \n", + "\n", + " Name campus \\\n", + "0 University of California-Davis DAVIS \n", + "1 University of California-Berkeley BERKELEY \n", + "2 University of California-Los Angeles IRVINE \n", + "3 University of California-Los Angeles IRVINE \n", + "4 University of California-Los Angeles DAVIS \n", + "5 University of California-Los Angeles LOS ANGELES \n", + "6 University of California-Los Angeles LOS ANGELES \n", + "7 University of California-Santa Cruz SANTA CRUZ \n", + "8 University of California-San Diego LOS ANGELES \n", + "9 University of California-San Diego SAN DIEGO \n", + "10 SRI International BERKELEY \n", + "11 SRI International IRVINE \n", + "12 SRI International SAN DIEGO \n", + "13 University of California-Berkeley BERKELEY \n", + "\n", + " title first last \n", + "0 PROFESSOR - ACADEMIC YEAR albert schwarz \n", + "1 PROFESSOR - ACADEMIC YEAR leo blitz \n", + "2 ANALYST, ADMINISTRATIVE, ASST paul davis \n", + "3 HOSPITAL UNIT SERV COORD II paul davis \n", + "4 RESEARCH _____ - FISCAL YEAR paul davis \n", + "5 PROGRAMMER/ANALYST IV - SUPERV paul davis \n", + "6 PROFESSOR - ACADEMIC YEAR paul davis \n", + "7 ADMIN/COORD/OFFICER(FUNC AREA) gretchen andreasen 
\n", + "8 REPRESENTATIVE, ACCESS, PRIN joseph pasquale \n", + "9 PROFESSOR-ACAD YR-BUS/ECON/ENG joseph pasquale \n", + "10 ACADEMIC COORD II-ACADEMIC YR mary mccready \n", + "11 SENIOR FINANCE ANALYST mary mccready \n", + "12 MANAGER, CASE mary mccready \n", + "13 PROFESSOR-ACAD YR-BUS/ECON/ENG ronald fearing " + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sel_col = ['AwardId', 'CityName', 'FirstName', 'ID', 'LastName', 'Name', 'campus', 'title', 'first', 'last']\n", "df_linked_data[sel_col]" @@ -1083,7 +3402,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here is the matching using probablistic matching. We can change the thresholds do see how results will vary." + "Here is the matching using probabilistic matching. We can change the thresholds to see how results will vary." ] }, { @@ -1112,7 +3431,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.5.2" } }, "nbformat": 4, From bc19fbbffd40b74b78b4f02731e245ad5e261b67 Mon Sep 17 00:00:00 2001 From: silil Date: Sun, 30 Jun 2019 22:33:28 +0100 Subject: [PATCH 2/3] fix bug --- .../record-linkage/RecordLinkage.ipynb | 86 ++++++++++--------- 1 file changed, 47 insertions(+), 39 deletions(-) diff --git a/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb b/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb index 5af506bf..d65b7298 100644 --- a/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb +++ b/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb @@ -42,11 +42,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2019-06-30T20:28:39.182112Z", - "start_time": "2019-06-30T20:28:37.683186Z" + "end_time": "2019-06-30T21:32:20.723952Z", + "start_time": 
"2019-06-30T21:32:19.173665Z" } }, "outputs": [ @@ -86,11 +86,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2019-06-30T20:28:42.776555Z", - "start_time": "2019-06-30T20:28:42.093983Z" + "end_time": "2019-06-30T21:32:21.906272Z", + "start_time": "2019-06-30T21:32:21.242534Z" } }, "outputs": [], @@ -101,11 +101,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2019-06-30T20:28:43.497302Z", - "start_time": "2019-06-30T20:28:43.419318Z" + "end_time": "2019-06-30T21:32:22.079207Z", + "start_time": "2019-06-30T21:32:21.990349Z" } }, "outputs": [ @@ -285,7 +285,7 @@ "4 United States District of Columbia DC " ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -304,11 +304,11 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2019-06-30T20:28:48.594425Z", - "start_time": "2019-06-30T20:28:47.255452Z" + "end_time": "2019-06-30T21:32:25.168908Z", + "start_time": "2019-06-30T21:32:23.677475Z" } }, "outputs": [], @@ -319,11 +319,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2019-06-30T20:28:48.640269Z", - "start_time": "2019-06-30T20:28:48.599594Z" + "end_time": "2019-06-30T21:32:25.221344Z", + "start_time": "2019-06-30T21:32:25.173120Z" } }, "outputs": [ @@ -446,7 +446,7 @@ "4 0.73 0.0 0.0 0 0 " ] }, - "execution_count": 5, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -480,11 +480,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2019-06-30T20:28:52.265775Z", - "start_time": "2019-06-30T20:28:52.251769Z" + "end_time": "2019-06-30T21:32:28.641015Z", + "start_time": "2019-06-30T21:32:28.628761Z" } }, "outputs": [ @@ 
-494,7 +494,7 @@ "array([2011])" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -513,11 +513,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2019-06-30T20:28:55.133262Z", - "start_time": "2019-06-30T20:28:55.077549Z" + "end_time": "2019-06-30T21:32:29.743462Z", + "start_time": "2019-06-30T21:32:29.690222Z" } }, "outputs": [ @@ -529,7 +529,7 @@ " 'SAN DIEGO', 'UCOP'], dtype=object)" ] }, - "execution_count": 7, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -548,30 +548,38 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "metadata": { "ExecuteTime": { - "end_time": "2019-06-30T20:28:59.381824Z", - "start_time": "2019-06-30T20:28:58.797922Z" + "end_time": "2019-06-30T21:32:58.356538Z", + "start_time": "2019-06-30T21:32:57.678888Z" } }, "outputs": [ { - "ename": "AttributeError", - "evalue": "'Series' object has no attribute 'sorted_values'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Look at number of entries by campus in the dataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m 
\u001b[0mdf_ucpay\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgroupby\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'campus'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msize\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msorted_values\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mascending\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mplot\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkind\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'barh'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/.pyenv/versions/3.5.2/envs/python_3/lib/python3.5/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 4374\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_can_hold_identifiers_and_holds_name\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4375\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4376\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4377\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4378\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mAttributeError\u001b[0m: 'Series' object has no attribute 
'sorted_values'" - ] + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ "# Look at number of entries by campus in the dataset\n", - "df_ucpay.groupby('campus').size().sorted_values(ascending=False).plot(kind='barh')" + "df_ucpay.groupby('campus').size().plot(kind='barh')" ] }, { From 89bec57296f081cb89193ba681dbd9e020962d8f Mon Sep 17 00:00:00 2001 From: silil Date: Wed, 3 Jul 2019 10:35:42 +0100 Subject: [PATCH 3/3] clear outputs --- .../record-linkage/RecordLinkage.ipynb | 2223 +---------------- 1 file changed, 80 insertions(+), 2143 deletions(-) diff --git a/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb b/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb index d65b7298..b4ac45f3 100644 --- a/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb +++ b/sources/curriculum/2_data_exploration_and_analysis/record-linkage/RecordLinkage.ipynb @@ -42,22 +42,14 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:32:20.723952Z", "start_time": "2019-06-30T21:32:19.173665Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Populating the interactive namespace from numpy and matplotlib\n" - ] - } - ], + "outputs": [], "source": [ "%pylab inline\n", "from __future__ import print_function\n", @@ -86,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:32:21.906272Z", @@ -101,195 +93,14 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:32:22.079207Z", "start_time": 
"2019-06-30T21:32:21.990349Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AwardIdFirstNameLastNameStartDateEndDateAwardTitleAwardEffectiveDateAwardExpirationDateNameCityNameZipCodePhoneNumberStreetAddressCountryNameStateNameStateCode
0415302JeffreyKuhn2010-01-15NaNAdvanced Technology Solar Telescope (ATST) Con...2010-01-012015-09-30Association of Universities for Research in As...Washington20005-39292.024832e+091212 New York Avenue, N.W.,United StatesDistrict of ColumbiaDC
1415302RobertRosner2010-01-15NaNAdvanced Technology Solar Telescope (ATST) Con...2010-01-012015-09-30Association of Universities for Research in As...Washington20005-39292.024832e+091212 New York Avenue, N.W.,United StatesDistrict of ColumbiaDC
2415302PhilipGoode2010-01-15NaNAdvanced Technology Solar Telescope (ATST) Con...2010-01-012015-09-30Association of Universities for Research in As...Washington20005-39292.024832e+091212 New York Avenue, N.W.,United StatesDistrict of ColumbiaDC
3415302ThomasRimmele2012-03-15NaNAdvanced Technology Solar Telescope (ATST) Con...2010-01-012015-09-30Association of Universities for Research in As...Washington20005-39292.024832e+091212 New York Avenue, N.W.,United StatesDistrict of ColumbiaDC
4415302StephenKeil2010-01-152012-03-15Advanced Technology Solar Telescope (ATST) Con...2010-01-012015-09-30Association of Universities for Research in As...Washington20005-39292.024832e+091212 New York Avenue, N.W.,United StatesDistrict of ColumbiaDC
\n", - "
" - ], - "text/plain": [ - " AwardId FirstName LastName StartDate EndDate \\\n", - "0 415302 Jeffrey Kuhn 2010-01-15 NaN \n", - "1 415302 Robert Rosner 2010-01-15 NaN \n", - "2 415302 Philip Goode 2010-01-15 NaN \n", - "3 415302 Thomas Rimmele 2012-03-15 NaN \n", - "4 415302 Stephen Keil 2010-01-15 2012-03-15 \n", - "\n", - " AwardTitle AwardEffectiveDate \\\n", - "0 Advanced Technology Solar Telescope (ATST) Con... 2010-01-01 \n", - "1 Advanced Technology Solar Telescope (ATST) Con... 2010-01-01 \n", - "2 Advanced Technology Solar Telescope (ATST) Con... 2010-01-01 \n", - "3 Advanced Technology Solar Telescope (ATST) Con... 2010-01-01 \n", - "4 Advanced Technology Solar Telescope (ATST) Con... 2010-01-01 \n", - "\n", - " AwardExpirationDate Name \\\n", - "0 2015-09-30 Association of Universities for Research in As... \n", - "1 2015-09-30 Association of Universities for Research in As... \n", - "2 2015-09-30 Association of Universities for Research in As... \n", - "3 2015-09-30 Association of Universities for Research in As... \n", - "4 2015-09-30 Association of Universities for Research in As... 
\n", - "\n", - " CityName ZipCode PhoneNumber StreetAddress \\\n", - "0 Washington 20005-3929 2.024832e+09 1212 New York Avenue, N.W., \n", - "1 Washington 20005-3929 2.024832e+09 1212 New York Avenue, N.W., \n", - "2 Washington 20005-3929 2.024832e+09 1212 New York Avenue, N.W., \n", - "3 Washington 20005-3929 2.024832e+09 1212 New York Avenue, N.W., \n", - "4 Washington 20005-3929 2.024832e+09 1212 New York Avenue, N.W., \n", - "\n", - " CountryName StateName StateCode \n", - "0 United States District of Columbia DC \n", - "1 United States District of Columbia DC \n", - "2 United States District of Columbia DC \n", - "3 United States District of Columbia DC \n", - "4 United States District of Columbia DC " - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Take a first look at the data\n", "df_nsf_awards.head()" @@ -304,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:32:25.168908Z", @@ -319,138 +130,14 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:32:25.221344Z", "start_time": "2019-06-30T21:32:25.173120Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDyearcampusnametitlegrossbaseovertimeextraexclude
017519712011BERKELEY***********TUTOR - NON-GSHIP0.490.00.000
117589842011BERKELEY***********TUTOR - NON-GSHIP0.490.00.000
218215852011IRVINE***********TUTOR - NON-GSHIP0.510.00.000
319668462011SAN FRANCISCOMACKEWICZ , CARL ESTAFF RESEARCH ASSOC III0.620.00.000
417589472011BERKELEY***********READER - NON-GSHIP0.730.00.000
\n", - "
" - ], - "text/plain": [ - " ID year campus name title \\\n", - "0 1751971 2011 BERKELEY *********** TUTOR - NON-GSHIP \n", - "1 1758984 2011 BERKELEY *********** TUTOR - NON-GSHIP \n", - "2 1821585 2011 IRVINE *********** TUTOR - NON-GSHIP \n", - "3 1966846 2011 SAN FRANCISCO MACKEWICZ , CARL E STAFF RESEARCH ASSOC III \n", - "4 1758947 2011 BERKELEY *********** READER - NON-GSHIP \n", - "\n", - " gross base overtime extra exclude \n", - "0 0.49 0.0 0.0 0 0 \n", - "1 0.49 0.0 0.0 0 0 \n", - "2 0.51 0.0 0.0 0 0 \n", - "3 0.62 0.0 0.0 0 0 \n", - "4 0.73 0.0 0.0 0 0 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Look at what the UC data contains\n", "df_ucpay.head()" @@ -480,25 +167,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:32:28.641015Z", "start_time": "2019-06-30T21:32:28.628761Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([2011])" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Get all unique entries in the 'year' column\n", "df_ucpay.year.unique()" @@ -513,27 +189,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:32:29.743462Z", "start_time": "2019-06-30T21:32:29.690222Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['BERKELEY', 'IRVINE', 'SAN FRANCISCO', 'LOS ANGELES', 'DANR',\n", - " 'SANTA BARBARA', 'SANTA CRUZ', 'RIVERSIDE', 'DAVIS', 'MERCED',\n", - " 'SAN DIEGO', 'UCOP'], dtype=object)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Get all unique entries in the 'campus' column\n", "df_ucpay.campus.unique()" @@ -548,35 +211,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 
null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:32:58.356538Z", "start_time": "2019-06-30T21:32:57.678888Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# Look at number of entries by campus in the dataset\n", "df_ucpay.groupby('campus').size().plot(kind='barh')" @@ -584,28 +226,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:28:59.522668Z", "start_time": "2019-06-30T20:28:59.452969Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "array(['TUTOR - NON-GSHIP', 'STAFF RESEARCH ASSOC III',\n", - " 'READER - NON-GSHIP', ..., 'ATHLETICS MANAGER 4',\n", - " 'CHIEF EXEC OFFICER - MED CENTR', 'TREASURER OF THE REGENTS'],\n", - " dtype=object)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Get all unique titles\n", "df_ucpay.title.unique()" @@ -613,25 +241,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:00.564528Z", "start_time": "2019-06-30T20:29:00.485584Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "2626" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Find out how many unique titles are present in the data\n", "len(df_ucpay.title.unique())" @@ -646,25 +263,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:03.109872Z", "start_time": "2019-06-30T20:29:03.099584Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "(259043, 10)" - ] - }, 
- "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Get number of rows and columns of UC dataset\n", "df_ucpay.shape" @@ -681,25 +287,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:05.748041Z", "start_time": "2019-06-30T20:29:05.626151Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "(163429, 10)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Use a mask to keep only entries that do NOT have stars instead of a name\n", "mask = df_ucpay.name != \"***********\" \n", @@ -717,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:08.142006Z", @@ -732,288 +327,14 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:08.923390Z", "start_time": "2019-06-30T20:29:08.825038Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDyearcampusnametitlegrossbaseovertimeextraexclude
319668462011SAN FRANCISCOMACKEWICZ , CARL ESTAFF RESEARCH ASSOC III0.620.000.000
618703902011LOS ANGELESESCUJURI , ERIC JOSEPHTECHNICIAN, SCENE, SR1.140.000.000
717719362011BERKELEYJUNG , WOO YONGPOSTDOC-EMPLOYEE1.281.280.000
818921222011LOS ANGELESSHAPIRO , JORDAN ISAACTECHNICIAN, SCENE1.330.000.000
1219883592011SANTA BARBARACUTLER , CHARLES IANLABORATORY ASST I1.550.000.000
1518618692011LOS ANGELESANDERSON , MARK CALVINTECHNICIAN, SCENE, SR1.830.000.000
2518871912011LOS ANGELESPATEL , DEV KAPILASSISTANT IV2.192.190.000
2618754262011LOS ANGELESHILDER , JAMIE LPOSTDOC-EMPLOYEE2.202.200.000
4619772092011SAN FRANCISCOWU , YALEIPOSTDOC-EMPLOYEE2.842.840.000
7318958652011LOS ANGELESVALERIO , STEVEN GERARD,JRTECHNICIAN, SCENE4.240.000.000
7419546032011SAN FRANCISCOBAILEY , GREGORY D____ASSISTANT, HOSPITAL, II4.240.000.000
8119308962011SAN DIEGOCRESPO , NOE CUAUHTEMOCPOSTDOC-EMPLOYEE4.494.490.000
8318129402011DAVISRENO , BROOKEASSISTANT III4.634.630.000
9119449912011SAN DIEGORUBIO DE LA TOR , ELENAPOSTDOC-EMPLOYEE5.195.190.000
9418153382011DAVISSMITH , KEN APROGRAMMER VII - SUPV5.290.000.000
\n", - "
" - ], - "text/plain": [ - " ID year campus name \\\n", - "3 1966846 2011 SAN FRANCISCO MACKEWICZ , CARL E \n", - "6 1870390 2011 LOS ANGELES ESCUJURI , ERIC JOSEPH \n", - "7 1771936 2011 BERKELEY JUNG , WOO YONG \n", - "8 1892122 2011 LOS ANGELES SHAPIRO , JORDAN ISAAC \n", - "12 1988359 2011 SANTA BARBARA CUTLER , CHARLES IAN \n", - "15 1861869 2011 LOS ANGELES ANDERSON , MARK CALVIN \n", - "25 1887191 2011 LOS ANGELES PATEL , DEV KAPIL \n", - "26 1875426 2011 LOS ANGELES HILDER , JAMIE L \n", - "46 1977209 2011 SAN FRANCISCO WU , YALEI \n", - "73 1895865 2011 LOS ANGELES VALERIO , STEVEN GERARD,JR \n", - "74 1954603 2011 SAN FRANCISCO BAILEY , GREGORY D \n", - "81 1930896 2011 SAN DIEGO CRESPO , NOE CUAUHTEMOC \n", - "83 1812940 2011 DAVIS RENO , BROOKE \n", - "91 1944991 2011 SAN DIEGO RUBIO DE LA TOR , ELENA \n", - "94 1815338 2011 DAVIS SMITH , KEN A \n", - "\n", - " title gross base overtime extra exclude \n", - "3 STAFF RESEARCH ASSOC III 0.62 0.00 0.0 0 0 \n", - "6 TECHNICIAN, SCENE, SR 1.14 0.00 0.0 0 0 \n", - "7 POSTDOC-EMPLOYEE 1.28 1.28 0.0 0 0 \n", - "8 TECHNICIAN, SCENE 1.33 0.00 0.0 0 0 \n", - "12 LABORATORY ASST I 1.55 0.00 0.0 0 0 \n", - "15 TECHNICIAN, SCENE, SR 1.83 0.00 0.0 0 0 \n", - "25 ASSISTANT IV 2.19 2.19 0.0 0 0 \n", - "26 POSTDOC-EMPLOYEE 2.20 2.20 0.0 0 0 \n", - "46 POSTDOC-EMPLOYEE 2.84 2.84 0.0 0 0 \n", - "73 TECHNICIAN, SCENE 4.24 0.00 0.0 0 0 \n", - "74 ____ASSISTANT, HOSPITAL, II 4.24 0.00 0.0 0 0 \n", - "81 POSTDOC-EMPLOYEE 4.49 4.49 0.0 0 0 \n", - "83 ASSISTANT III 4.63 4.63 0.0 0 0 \n", - "91 POSTDOC-EMPLOYEE 5.19 5.19 0.0 0 0 \n", - "94 PROGRAMMER VII - SUPV 5.29 0.00 0.0 0 0 " - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Look at the first 15 entries in the updated dataset with redacted names removed\n", "df_ucpay.head(15)" @@ -1030,7 +351,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": { 
"ExecuteTime": { "end_time": "2019-06-30T20:29:12.209908Z", @@ -1048,95 +369,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:12.859866Z", "start_time": "2019-06-30T20:29:12.818769Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDcampusnametitle
31966846SAN FRANCISCOMACKEWICZ , CARL ESTAFF RESEARCH ASSOC III
61870390LOS ANGELESESCUJURI , ERIC JOSEPHTECHNICIAN, SCENE, SR
71771936BERKELEYJUNG , WOO YONGPOSTDOC-EMPLOYEE
81892122LOS ANGELESSHAPIRO , JORDAN ISAACTECHNICIAN, SCENE
121988359SANTA BARBARACUTLER , CHARLES IANLABORATORY ASST I
\n", - "
" - ], - "text/plain": [ - " ID campus name title\n", - "3 1966846 SAN FRANCISCO MACKEWICZ , CARL E STAFF RESEARCH ASSOC III\n", - "6 1870390 LOS ANGELES ESCUJURI , ERIC JOSEPH TECHNICIAN, SCENE, SR\n", - "7 1771936 BERKELEY JUNG , WOO YONG POSTDOC-EMPLOYEE\n", - "8 1892122 LOS ANGELES SHAPIRO , JORDAN ISAAC TECHNICIAN, SCENE\n", - "12 1988359 SANTA BARBARA CUTLER , CHARLES IAN LABORATORY ASST I" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Look at the updated dataframe\n", "df_ucpay.head()" @@ -1151,7 +391,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:15.702750Z", @@ -1166,195 +406,14 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:16.304034Z", "start_time": "2019-06-30T20:29:16.236810Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AwardIdFirstNameLastNameStartDateEndDateAwardTitleAwardEffectiveDateAwardExpirationDateNameCityNameZipCodePhoneNumberStreetAddressCountryNameStateNameStateCode
17805989AlbertSchwarz2010-08-26NaNApplication of methods of arithmetic geometry ...2010-09-012014-08-31University of California-DavisDavis95618-00005.307548e+09OR/Sponsored ProgramsUnited StatesCaliforniaCA
40820047FransTax2011-03-28NaNArabidopsis 2010: Global Analysis of Translati...2009-03-012013-02-28University of California-RiversideRIVERSIDE92521-10009.518276e+09Office of ResearchUnited StatesCaliforniaCA
49825254PaulDavis2012-01-27NaNCollaborative Research: A 3D Seismic Study of ...2010-01-012013-09-30University of California-Los AngelesLOS ANGELES90095-20003.107940e+0911000 Kinross Avenue, Suite 211United StatesCaliforniaCA
53830228NicholasMelosh2011-08-06NaNNSEC: CENTER FOR PROBING THE NANOSCALE2009-09-012014-08-31Stanford UniversityPalo Alto94304-12126.507232e+093160 Porter DriveUnited StatesCaliforniaCA
58831132JosephPasquale2010-04-12NaNCollaborative Research; CT-M: Computer Systems...2009-03-012012-12-31University of California-San DiegoLa Jolla92093-09348.585345e+09Office of Contract & Grant AdminUnited StatesCaliforniaCA
\n", - "
" - ], - "text/plain": [ - " AwardId FirstName LastName StartDate EndDate \\\n", - "17 805989 Albert Schwarz 2010-08-26 NaN \n", - "40 820047 Frans Tax 2011-03-28 NaN \n", - "49 825254 Paul Davis 2012-01-27 NaN \n", - "53 830228 Nicholas Melosh 2011-08-06 NaN \n", - "58 831132 Joseph Pasquale 2010-04-12 NaN \n", - "\n", - " AwardTitle AwardEffectiveDate \\\n", - "17 Application of methods of arithmetic geometry ... 2010-09-01 \n", - "40 Arabidopsis 2010: Global Analysis of Translati... 2009-03-01 \n", - "49 Collaborative Research: A 3D Seismic Study of ... 2010-01-01 \n", - "53 NSEC: CENTER FOR PROBING THE NANOSCALE 2009-09-01 \n", - "58 Collaborative Research; CT-M: Computer Systems... 2009-03-01 \n", - "\n", - " AwardExpirationDate Name CityName \\\n", - "17 2014-08-31 University of California-Davis Davis \n", - "40 2013-02-28 University of California-Riverside RIVERSIDE \n", - "49 2013-09-30 University of California-Los Angeles LOS ANGELES \n", - "53 2014-08-31 Stanford University Palo Alto \n", - "58 2012-12-31 University of California-San Diego La Jolla \n", - "\n", - " ZipCode PhoneNumber StreetAddress CountryName \\\n", - "17 95618-0000 5.307548e+09 OR/Sponsored Programs United States \n", - "40 92521-1000 9.518276e+09 Office of Research United States \n", - "49 90095-2000 3.107940e+09 11000 Kinross Avenue, Suite 211 United States \n", - "53 94304-1212 6.507232e+09 3160 Porter Drive United States \n", - "58 92093-0934 8.585345e+09 Office of Contract & Grant Admin United States \n", - "\n", - " StateName StateCode \n", - "17 California CA \n", - "40 California CA \n", - "49 California CA \n", - "53 California CA \n", - "58 California CA " - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df_nsf_awards.head()" ] @@ -1386,23 +445,14 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:19.359643Z", 
"start_time": "2019-06-30T20:29:19.349544Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['MACKEWICZ , CARL E' 'ESCUJURI , ERIC JOSEPH' 'JUNG , WOO YONG' ...\n", - " 'BUSUTTIL , RONALD W' 'HOWLAND , BENJAMIN CLARK' 'TEDFORD , JEFF']\n" - ] - } - ], + "outputs": [], "source": [ "# Get all of the values in the \"name\" column in the df_ucpay dataframe \n", "names_ucpay = df_ucpay.name.values\n", @@ -1418,22 +468,14 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:20.633341Z", "start_time": "2019-06-30T20:29:20.624167Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Albert' 'Frans' 'Paul' ... 'Wenguang' 'Laurent' 'Benjamin']\n" - ] - } - ], + "outputs": [], "source": [ "# Get all of the values in the \"FirstName\" column in the df_nsf_awards dataframe\n", "firstnames_nsf_awards = df_nsf_awards.FirstName.values\n", @@ -1442,22 +484,14 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:21.450620Z", "start_time": "2019-06-30T20:29:21.436951Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['Schwarz' 'Tax' 'Davis' ... 
'Sun' 'El Ghaoui' 'Lev']\n" - ] - } - ], + "outputs": [], "source": [ "# Get all of the values in the \"LastName\" column in the df_nsf_awards dataframe\n", "lastnames_nsf_awards = df_nsf_awards.LastName.values\n", @@ -1475,22 +509,14 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:24.330673Z", "start_time": "2019-06-30T20:29:24.314937Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['mackewicz ', ' carl e']\n" - ] - } - ], + "outputs": [], "source": [ "# Take the first name from the UC dataset\n", "test_name = names_ucpay[0]\n", @@ -1513,7 +539,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:27.485413Z", @@ -1569,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:31.060418Z", @@ -1591,7 +617,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:32.443955Z", @@ -1605,7 +631,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:33.114011Z", @@ -1622,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:34.070913Z", @@ -1630,113 +656,7 @@ }, "scrolled": true }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDcampusnametitlefirstmiddlelast
31966846SAN FRANCISCOMACKEWICZ , CARL ESTAFF RESEARCH ASSOC IIIcarlemackewicz
61870390LOS ANGELESESCUJURI , ERIC JOSEPHTECHNICIAN, SCENE, SRericjosephescujuri
71771936BERKELEYJUNG , WOO YONGPOSTDOC-EMPLOYEEwooyongjung
81892122LOS ANGELESSHAPIRO , JORDAN ISAACTECHNICIAN, SCENEjordanisaacshapiro
121988359SANTA BARBARACUTLER , CHARLES IANLABORATORY ASST Icharlesiancutler
\n", - "
" - ], - "text/plain": [ - " ID campus name title \\\n", - "3 1966846 SAN FRANCISCO MACKEWICZ , CARL E STAFF RESEARCH ASSOC III \n", - "6 1870390 LOS ANGELES ESCUJURI , ERIC JOSEPH TECHNICIAN, SCENE, SR \n", - "7 1771936 BERKELEY JUNG , WOO YONG POSTDOC-EMPLOYEE \n", - "8 1892122 LOS ANGELES SHAPIRO , JORDAN ISAAC TECHNICIAN, SCENE \n", - "12 1988359 SANTA BARBARA CUTLER , CHARLES IAN LABORATORY ASST I \n", - "\n", - " first middle last \n", - "3 carl e mackewicz \n", - "6 eric joseph escujuri \n", - "7 woo yong jung \n", - "8 jordan isaac shapiro \n", - "12 charles ian cutler " - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df_ucpay.head()" ] @@ -1752,7 +672,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:36.893652Z", @@ -1773,7 +693,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:37.959183Z", @@ -1797,7 +717,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:29:41.392713Z", @@ -1805,200 +725,7 @@ }, "scrolled": false }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AwardIdFirstNameLastNameStartDateEndDateAwardTitleAwardEffectiveDateAwardExpirationDateNameCityNameZipCodePhoneNumberStreetAddressCountryNameStateNameStateCodefirstlast
17805989AlbertSchwarz2010-08-26NaNApplication of methods of arithmetic geometry ...2010-09-012014-08-31University of California-DavisDavis95618-00005.307548e+09OR/Sponsored ProgramsUnited StatesCaliforniaCAalbertschwarz
40820047FransTax2011-03-28NaNArabidopsis 2010: Global Analysis of Translati...2009-03-012013-02-28University of California-RiversideRIVERSIDE92521-10009.518276e+09Office of ResearchUnited StatesCaliforniaCAfranstax
49825254PaulDavis2012-01-27NaNCollaborative Research: A 3D Seismic Study of ...2010-01-012013-09-30University of California-Los AngelesLOS ANGELES90095-20003.107940e+0911000 Kinross Avenue, Suite 211United StatesCaliforniaCApauldavis
53830228NicholasMelosh2011-08-06NaNNSEC: CENTER FOR PROBING THE NANOSCALE2009-09-012014-08-31Stanford UniversityPalo Alto94304-12126.507232e+093160 Porter DriveUnited StatesCaliforniaCAnicholasmelosh
58831132JosephPasquale2010-04-12NaNCollaborative Research; CT-M: Computer Systems...2009-03-012012-12-31University of California-San DiegoLa Jolla92093-09348.585345e+09Office of Contract & Grant AdminUnited StatesCaliforniaCAjosephpasquale
\n", - "
" - ], - "text/plain": [ - " AwardId FirstName LastName StartDate EndDate \\\n", - "17 805989 Albert Schwarz 2010-08-26 NaN \n", - "40 820047 Frans Tax 2011-03-28 NaN \n", - "49 825254 Paul Davis 2012-01-27 NaN \n", - "53 830228 Nicholas Melosh 2011-08-06 NaN \n", - "58 831132 Joseph Pasquale 2010-04-12 NaN \n", - "\n", - " AwardTitle AwardEffectiveDate \\\n", - "17 Application of methods of arithmetic geometry ... 2010-09-01 \n", - "40 Arabidopsis 2010: Global Analysis of Translati... 2009-03-01 \n", - "49 Collaborative Research: A 3D Seismic Study of ... 2010-01-01 \n", - "53 NSEC: CENTER FOR PROBING THE NANOSCALE 2009-09-01 \n", - "58 Collaborative Research; CT-M: Computer Systems... 2009-03-01 \n", - "\n", - " AwardExpirationDate Name CityName \\\n", - "17 2014-08-31 University of California-Davis Davis \n", - "40 2013-02-28 University of California-Riverside RIVERSIDE \n", - "49 2013-09-30 University of California-Los Angeles LOS ANGELES \n", - "53 2014-08-31 Stanford University Palo Alto \n", - "58 2012-12-31 University of California-San Diego La Jolla \n", - "\n", - " ZipCode PhoneNumber StreetAddress CountryName \\\n", - "17 95618-0000 5.307548e+09 OR/Sponsored Programs United States \n", - "40 92521-1000 9.518276e+09 Office of Research United States \n", - "49 90095-2000 3.107940e+09 11000 Kinross Avenue, Suite 211 United States \n", - "53 94304-1212 6.507232e+09 3160 Porter Drive United States \n", - "58 92093-0934 8.585345e+09 Office of Contract & Grant Admin United States \n", - "\n", - " StateName StateCode first last \n", - "17 California CA albert schwarz \n", - "40 California CA frans tax \n", - "49 California CA paul davis \n", - "53 California CA nicholas melosh \n", - "58 California CA joseph pasquale " - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "df_nsf_awards.head()" ] @@ -2018,7 +745,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, 
"metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:31:55.297567Z", @@ -2054,7 +781,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:31:56.414063Z", @@ -2072,25 +799,14 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:31:56.831857Z", "start_time": "2019-06-30T20:31:56.819260Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "'carl'" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Comparison of records\n", "testname = unicode(uc_firstnames[0])\n", @@ -2099,7 +815,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:36:01.976639Z", @@ -2134,68 +850,28 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:32:31.795435Z", "start_time": "2019-06-30T20:32:31.349966Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "carl ['carl', 'carlo', 'carli', 'carly', 'carla', 'carol', 'caryl', 'carlos', 'carlie', 'carlee']\n" - ] - } - ], + "outputs": [], "source": [ "print(testname, get_matching_first_name(testname, uc_firstnames))" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:32:43.569250Z", "start_time": "2019-06-30T20:32:32.044145Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1 carl ['carl', 'carlo', 'carli', 'carly', 'carla', 'carol', 'caryl', 'carlos', 'carlie', 'carlee']\n", - "2 eric ['eric', 'erick', 'erice', 'erich', 'erica', 'erric', 'edric', 'enric', 'ericka', 'ericha']\n", - "3 woo ['woo', 'wood', 'woon', 'wook', 'wo', 'woody', 'woori', 'elwood', 'witoon', 
'woojong']\n", - "4 jordan ['jordan', 'jordana', 'jordane', 'jourdan', 'jordin', 'jordon', 'jordaniel', 'jodean', 'joan', 'joraine']\n", - "5 charles ['charles', 'charless', 'charley', 'charese', 'charlee', 'charlie', 'charleston', 'charline', 'charlane', 'charleen']\n", - "6 mark ['mark', 'marko', 'marka', 'marek', 'markos', 'markee', 'markle', 'markim', 'markus', 'marika']\n", - "7 dev ['dev', 'devi', 'deva', 'de', 'devra', 'devie', 'devyn', 'devan', 'devin', 'devon']\n", - "8 jamie ['jamie', 'jamiel', 'jammie', 'jami', 'jaymie', 'jaime', 'jamaine', 'amie', 'jasmine', 'jazmine']\n", - "9 yalei ['yalei', 'yale', 'yanlei', 'yali', 'yaneli', 'yawei', 'yafei', 'yanli', 'yael', 'maylei']\n", - "10 steven ['steven', 'steve', 'steaven', 'stevie', 'stevon', 'stevan', 'sten', 'steven-huy', 'seveen', 'seve']\n", - "11 gregory ['gregory', 'gregor', 'grigory', 'gregorij', 'gregorio', 'gregoria', 'gregoire', 'greg', 'grzegorz', 'grigoriy']\n", - "12 noe ['noe', 'noel', 'nomer', 'noemi', 'noree', 'noemy', 'noeun', 'nokteh', 'noelle', 'noriel']\n", - "13 brooke ['brooke', 'brookie', 'brook', 'brooks', 'booker', 'brooklyn', 'brooklynn', 'brock', 'burke', 'broc']\n", - "14 elena ['elena', 'elenna', 'elen', 'ellena', 'elna', 'helena', 'jelena', 'celena', 'yelena', 'selena']\n", - "15 ken ['ken', 'keen', 'kien', 'kean', 'koen', 'kent', 'keni', 'keun', 'keon', 'ke']\n", - "16 kevin ['kevin', 'kelvin', 'evin', 'kevis', 'keven', 'kevan', 'r.kevin', 'kevin-paul', 'keqin', 'kexin']\n", - "17 victoria ['victoria', 'victorina', 'victori', 'vicktoria', 'victoriana', 'victoriano', 'victorio', 'victor', 'victorya', 'viktoria']\n", - "18 sarah ['sarah', 'sarath', 'sarahy', 'sara', 'sarahann', 'samrrah', 'saurabh', 'safirah', 'saras', 'sarai']\n", - "19 kaitlin ['kaitlin', 'kailin', 'kaitlyn', 'kaili', 'katalin', 'katelin', 'kaetlin', 'katlin', 'caitlin', 'kaiqin']\n", - "20 sabrina ['sabrina', 'sabrin', 'sabina', 'sarina', 'sabra', 'sabin', 'sarbani', 'sarna', 'sharina', 'sebrina']\n", - "21 
michele ['michele', 'michelle', 'michaele', 'michel', \"miche-le'\", 'micheline', 'micheal', 'michela', 'michiel', 'michael']\n", - "22 fernando ['fernando', 'fernand', 'fernan', 'fernanda', 'hernando', 'fern', 'fenno', 'ferdinando', 'ferdnand', 'ferda']\n", - "23 michelle ['michelle', 'michele', 'michell', 'mi-chelle', 'mitchelle', 'mischelle', 'michellene', 'michel', 'michaele', 'marichelle']\n", - "24 adi ['adi', 'ardi', 'adib', 'hadi', 'fadi', 'gadi', 'adil', 'adia', 'fadil', 'fadia']\n", - "25 cindy ['cindy', 'cinda', 'cindi', 'cindra', 'cindee', 'cindy-heung', 'cyndy', 'candy', 'cendy', 'lindy']\n" - ] - } - ], + "outputs": [], "source": [ "for i, nm in enumerate(uc_firstnames[:25]):\n", " testname = unicode(nm)\n", @@ -2221,7 +897,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:32:48.377777Z", @@ -2235,211 +911,21 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:32:49.052801Z", "start_time": "2019-06-30T20:32:49.009830Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "{17: {'AwardEffectiveDate': '2010-09-01',\n", - " 'AwardExpirationDate': '2014-08-31',\n", - " 'AwardId': 805989,\n", - " 'AwardTitle': 'Application of methods of arithmetic geometry and homological algebra to quantum field theory and string theory',\n", - " 'CityName': 'Davis',\n", - " 'CountryName': 'United States',\n", - " 'EndDate': nan,\n", - " 'FirstName': 'Albert',\n", - " 'LastName': 'Schwarz',\n", - " 'Name': 'University of California-Davis',\n", - " 'PhoneNumber': 5307547700.0,\n", - " 'StartDate': '2010-08-26',\n", - " 'StateCode': 'CA',\n", - " 'StateName': 'California',\n", - " 'StreetAddress': 'OR/Sponsored Programs',\n", - " 'ZipCode': '95618-0000',\n", - " 'first': 'albert',\n", - " 'last': 'schwarz'},\n", - " 40: {'AwardEffectiveDate': '2009-03-01',\n", - " 'AwardExpirationDate': 
'2013-02-28',\n", - " 'AwardId': 820047,\n", - " 'AwardTitle': 'Arabidopsis 2010: Global Analysis of Translational Regulons',\n", - " 'CityName': 'RIVERSIDE',\n", - " 'CountryName': 'United States',\n", - " 'EndDate': nan,\n", - " 'FirstName': 'Frans',\n", - " 'LastName': 'Tax',\n", - " 'Name': 'University of California-Riverside',\n", - " 'PhoneNumber': 9518275535.0,\n", - " 'StartDate': '2011-03-28',\n", - " 'StateCode': 'CA',\n", - " 'StateName': 'California',\n", - " 'StreetAddress': 'Office of Research',\n", - " 'ZipCode': '92521-1000',\n", - " 'first': 'frans',\n", - " 'last': 'tax'},\n", - " 49: {'AwardEffectiveDate': '2010-01-01',\n", - " 'AwardExpirationDate': '2013-09-30',\n", - " 'AwardId': 825254,\n", - " 'AwardTitle': 'Collaborative Research: A 3D Seismic Study of the Pacific-North American Plate Boundary in Southern California',\n", - " 'CityName': 'LOS ANGELES',\n", - " 'CountryName': 'United States',\n", - " 'EndDate': nan,\n", - " 'FirstName': 'Paul',\n", - " 'LastName': 'Davis',\n", - " 'Name': 'University of California-Los Angeles',\n", - " 'PhoneNumber': 3107940102.0,\n", - " 'StartDate': '2012-01-27',\n", - " 'StateCode': 'CA',\n", - " 'StateName': 'California',\n", - " 'StreetAddress': '11000 Kinross Avenue, Suite 211',\n", - " 'ZipCode': '90095-2000',\n", - " 'first': 'paul',\n", - " 'last': 'davis'},\n", - " 53: {'AwardEffectiveDate': '2009-09-01',\n", - " 'AwardExpirationDate': '2014-08-31',\n", - " 'AwardId': 830228,\n", - " 'AwardTitle': 'NSEC: CENTER FOR PROBING THE NANOSCALE',\n", - " 'CityName': 'Palo Alto',\n", - " 'CountryName': 'United States',\n", - " 'EndDate': nan,\n", - " 'FirstName': 'Nicholas',\n", - " 'LastName': 'Melosh',\n", - " 'Name': 'Stanford University',\n", - " 'PhoneNumber': 6507232300.0,\n", - " 'StartDate': '2011-08-06',\n", - " 'StateCode': 'CA',\n", - " 'StateName': 'California',\n", - " 'StreetAddress': '3160 Porter Drive',\n", - " 'ZipCode': '94304-1212',\n", - " 'first': 'nicholas',\n", - " 'last': 
'melosh'},\n", - " 58: {'AwardEffectiveDate': '2009-03-01',\n", - " 'AwardExpirationDate': '2012-12-31',\n", - " 'AwardId': 831132,\n", - " 'AwardTitle': 'Collaborative Research; CT-M: Computer Systems Vulnerabilities and the Efficacy of Defensive Mechanisms',\n", - " 'CityName': 'La Jolla',\n", - " 'CountryName': 'United States',\n", - " 'EndDate': nan,\n", - " 'FirstName': 'Joseph',\n", - " 'LastName': 'Pasquale',\n", - " 'Name': 'University of California-San Diego',\n", - " 'PhoneNumber': 8585344896.0,\n", - " 'StartDate': '2010-04-12',\n", - " 'StateCode': 'CA',\n", - " 'StateName': 'California',\n", - " 'StreetAddress': 'Office of Contract & Grant Admin',\n", - " 'ZipCode': '92093-0934',\n", - " 'first': 'joseph',\n", - " 'last': 'pasquale'},\n", - " 69: {'AwardEffectiveDate': '2009-09-01',\n", - " 'AwardExpirationDate': '2014-08-31',\n", - " 'AwardId': 832819,\n", - " 'AwardTitle': 'The Center of Integrated Nanomechanical Systems (COINS) Renewal Yrs 6-10',\n", - " 'CityName': 'BERKELEY',\n", - " 'CountryName': 'United States',\n", - " 'EndDate': nan,\n", - " 'FirstName': 'Ronald',\n", - " 'LastName': 'Fearing',\n", - " 'Name': 'University of California-Berkeley',\n", - " 'PhoneNumber': 5106428109.0,\n", - " 'StartDate': '2010-02-25',\n", - " 'StateCode': 'CA',\n", - " 'StateName': 'California',\n", - " 'StreetAddress': 'Sponsored Projects Office',\n", - " 'ZipCode': '94704-5940',\n", - " 'first': 'ronald',\n", - " 'last': 'fearing'},\n", - " 73: {'AwardEffectiveDate': '2009-01-01',\n", - " 'AwardExpirationDate': '2014-12-31',\n", - " 'AwardId': 833340,\n", - " 'AwardTitle': 'Robert Noyce Teacher Scholars Program',\n", - " 'CityName': 'SANTA CRUZ',\n", - " 'CountryName': 'United States',\n", - " 'EndDate': nan,\n", - " 'FirstName': 'Gretchen',\n", - " 'LastName': 'Andreasen',\n", - " 'Name': 'University of California-Santa Cruz',\n", - " 'PhoneNumber': 8314595278.0,\n", - " 'StartDate': '2011-07-19',\n", - " 'StateCode': 'CA',\n", - " 'StateName': 
'California',\n", - " 'StreetAddress': '1156 High Street',\n", - " 'ZipCode': '95064-1077',\n", - " 'first': 'gretchen',\n", - " 'last': 'andreasen'},\n", - " 103: {'AwardEffectiveDate': '2009-01-01',\n", - " 'AwardExpirationDate': '2014-09-30',\n", - " 'AwardId': 836152,\n", - " 'AwardTitle': 'The Sondrestrom Upper Atmospheric Research Facility: Research, Operation and Coordination',\n", - " 'CityName': 'MENLO PARK',\n", - " 'CountryName': 'United States',\n", - " 'EndDate': '2013-05-30',\n", - " 'FirstName': 'Craig',\n", - " 'LastName': 'Heinselman',\n", - " 'Name': 'SRI International',\n", - " 'PhoneNumber': 6508592651.0,\n", - " 'StartDate': '2012-12-20',\n", - " 'StateCode': 'CA',\n", - " 'StateName': 'California',\n", - " 'StreetAddress': '333 RAVENSWOOD AVE',\n", - " 'ZipCode': '94025-3493',\n", - " 'first': 'craig',\n", - " 'last': 'heinselman'},\n", - " 104: {'AwardEffectiveDate': '2009-01-01',\n", - " 'AwardExpirationDate': '2014-09-30',\n", - " 'AwardId': 836152,\n", - " 'AwardTitle': 'The Sondrestrom Upper Atmospheric Research Facility: Research, Operation and Coordination',\n", - " 'CityName': 'MENLO PARK',\n", - " 'CountryName': 'United States',\n", - " 'EndDate': '2013-05-30',\n", - " 'FirstName': 'Mary',\n", - " 'LastName': 'McCready',\n", - " 'Name': 'SRI International',\n", - " 'PhoneNumber': 6508592651.0,\n", - " 'StartDate': '2012-02-10',\n", - " 'StateCode': 'CA',\n", - " 'StateName': 'California',\n", - " 'StreetAddress': '333 RAVENSWOOD AVE',\n", - " 'ZipCode': '94025-3493',\n", - " 'first': 'mary',\n", - " 'last': 'mccready'},\n", - " 131: {'AwardEffectiveDate': '2009-07-01',\n", - " 'AwardExpirationDate': '2013-06-30',\n", - " 'AwardId': 838258,\n", - " 'AwardTitle': 'Collaborative Research: Astronomy with CARMA',\n", - " 'CityName': 'BERKELEY',\n", - " 'CountryName': 'United States',\n", - " 'EndDate': nan,\n", - " 'FirstName': 'Leo',\n", - " 'LastName': 'Blitz',\n", - " 'Name': 'University of California-Berkeley',\n", - " 'PhoneNumber': 
5106428109.0,\n", - " 'StartDate': '2011-03-07',\n", - " 'StateCode': 'CA',\n", - " 'StateName': 'California',\n", - " 'StreetAddress': 'Sponsored Projects Office',\n", - " 'ZipCode': '94704-5940',\n", - " 'first': 'leo',\n", - " 'last': 'blitz'}}" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "dict_nsf_awards" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:36:17.939991Z", @@ -2487,7 +973,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:36:19.185824Z", @@ -2541,266 +1027,28 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:36:28.469466Z", "start_time": "2019-06-30T20:36:20.301998Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No Match: nicholas melosh\n", - "No Match: frans tax\n", - "No Match: craig heinselman\n", - "No Match: mary mccready\n" - ] - } - ], + "outputs": [], "source": [ "df_linked_data = match_records(dict_nsf_awards, df_ucpay, create_rule_mask )" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T20:36:28.556271Z", "start_time": "2019-06-30T20:36:28.474144Z" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AwardIdCityNameFirstNameIDLastNameNamecampustitlefirstlast
0805989.0DavisAlbert1814365.0SchwarzUniversity of California-DavisDAVISPROFESSOR - ACADEMIC YEARalbertschwarz
1838258.0BERKELEYLeo1765982.0BlitzUniversity of California-BerkeleyBERKELEYPROFESSOR - ACADEMIC YEARleoblitz
2825254.0LOS ANGELESPaul1832316.0DavisUniversity of California-Los AngelesIRVINEANALYST, ADMINISTRATIVE, ASSTpauldavis
3825254.0LOS ANGELESPaul1832352.0DavisUniversity of California-Los AngelesIRVINEHOSPITAL UNIT SERV COORD IIpauldavis
4825254.0LOS ANGELESPaul1800777.0DavisUniversity of California-Los AngelesDAVISRESEARCH _____ - FISCAL YEARpauldavis
5825254.0LOS ANGELESPaul1868502.0DavisUniversity of California-Los AngelesLOS ANGELESPROGRAMMER/ANALYST IV - SUPERVpauldavis
6825254.0LOS ANGELESPaul1868593.0DavisUniversity of California-Los AngelesLOS ANGELESPROFESSOR - ACADEMIC YEARpauldavis
7833340.0SANTA CRUZGretchen1999814.0AndreasenUniversity of California-Santa CruzSANTA CRUZADMIN/COORD/OFFICER(FUNC AREA)gretchenandreasen
8831132.0La JollaJoseph1887173.0PasqualeUniversity of California-San DiegoLOS ANGELESREPRESENTATIVE, ACCESS, PRINjosephpasquale
9831132.0La JollaJoseph1942864.0PasqualeUniversity of California-San DiegoSAN DIEGOPROFESSOR-ACAD YR-BUS/ECON/ENGjosephpasquale
10832819.0BERKELEYRonald1768965.0FearingUniversity of California-BerkeleyBERKELEYPROFESSOR-ACAD YR-BUS/ECON/ENGronaldfearing
\n", - "
" - ], - "text/plain": [ - " AwardId CityName FirstName ID LastName \\\n", - "0 805989.0 Davis Albert 1814365.0 Schwarz \n", - "1 838258.0 BERKELEY Leo 1765982.0 Blitz \n", - "2 825254.0 LOS ANGELES Paul 1832316.0 Davis \n", - "3 825254.0 LOS ANGELES Paul 1832352.0 Davis \n", - "4 825254.0 LOS ANGELES Paul 1800777.0 Davis \n", - "5 825254.0 LOS ANGELES Paul 1868502.0 Davis \n", - "6 825254.0 LOS ANGELES Paul 1868593.0 Davis \n", - "7 833340.0 SANTA CRUZ Gretchen 1999814.0 Andreasen \n", - "8 831132.0 La Jolla Joseph 1887173.0 Pasquale \n", - "9 831132.0 La Jolla Joseph 1942864.0 Pasquale \n", - "10 832819.0 BERKELEY Ronald 1768965.0 Fearing \n", - "\n", - " Name campus \\\n", - "0 University of California-Davis DAVIS \n", - "1 University of California-Berkeley BERKELEY \n", - "2 University of California-Los Angeles IRVINE \n", - "3 University of California-Los Angeles IRVINE \n", - "4 University of California-Los Angeles DAVIS \n", - "5 University of California-Los Angeles LOS ANGELES \n", - "6 University of California-Los Angeles LOS ANGELES \n", - "7 University of California-Santa Cruz SANTA CRUZ \n", - "8 University of California-San Diego LOS ANGELES \n", - "9 University of California-San Diego SAN DIEGO \n", - "10 University of California-Berkeley BERKELEY \n", - "\n", - " title first last \n", - "0 PROFESSOR - ACADEMIC YEAR albert schwarz \n", - "1 PROFESSOR - ACADEMIC YEAR leo blitz \n", - "2 ANALYST, ADMINISTRATIVE, ASST paul davis \n", - "3 HOSPITAL UNIT SERV COORD II paul davis \n", - "4 RESEARCH _____ - FISCAL YEAR paul davis \n", - "5 PROGRAMMER/ANALYST IV - SUPERV paul davis \n", - "6 PROFESSOR - ACADEMIC YEAR paul davis \n", - "7 ADMIN/COORD/OFFICER(FUNC AREA) gretchen andreasen \n", - "8 REPRESENTATIVE, ACCESS, PRIN joseph pasquale \n", - "9 PROFESSOR-ACAD YR-BUS/ECON/ENG joseph pasquale \n", - "10 PROFESSOR-ACAD YR-BUS/ECON/ENG ronald fearing " - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], + 
"outputs": [], "source": [ "selected_columns = ['AwardId', 'CityName', 'FirstName', 'ID', 'LastName', 'Name', 'campus', 'title', 'first', 'last']\n", "df_linked_data[selected_columns]" @@ -2846,7 +1094,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:00:59.871185Z", @@ -2969,7 +1217,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:01:00.290652Z", @@ -2983,68 +1231,42 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:01:00.790494Z", "start_time": "2019-06-30T21:01:00.783007Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "True\n" - ] - } - ], + "outputs": [], "source": [ "print( fs.link_record(('Avishek','Kumar'), ('Avishek','Kumar')) )" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:01:01.352939Z", "start_time": "2019-06-30T21:01:01.342885Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "False\n" - ] - } - ], + "outputs": [], "source": [ "print( fs.link_record( ('Avishek','Kumar'), ('Anup','Kumar') ) )" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:01:04.770898Z", "start_time": "2019-06-30T21:01:04.756602Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "jonathon jonthon 2\n", - "john mark 0\n", - "fred frederick 1\n" - ] - } - ], + "outputs": [], "source": [ "#let's take this new function for a spin\n", "print('jonathon', 'jonthon', fs.fuzzy_match('jonathon','jonthon') )\n", @@ -3054,7 +1276,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": null, "metadata": { "ExecuteTime": { 
"end_time": "2019-06-30T21:06:27.846712Z", @@ -3093,31 +1315,21 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:07:02.333067Z", "start_time": "2019-06-30T21:06:40.122671Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "No Match: nicholas melosh\n", - "No Match: frans tax\n", - "No Match: craig heinselman\n" - ] - } - ], + "outputs": [], "source": [ "df_linked_data = match_records(dict_nsf_awards, df_ucpay, create_jaro_mask )" ] }, { "cell_type": "code", - "execution_count": 81, + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-06-30T21:07:17.414495Z", @@ -3125,282 +1337,7 @@ }, "scrolled": true }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
AwardIdCityNameFirstNameIDLastNameNamecampustitlefirstlast
0805989.0DavisAlbert1814365.0SchwarzUniversity of California-DavisDAVISPROFESSOR - ACADEMIC YEARalbertschwarz
1838258.0BERKELEYLeo1765982.0BlitzUniversity of California-BerkeleyBERKELEYPROFESSOR - ACADEMIC YEARleoblitz
2825254.0LOS ANGELESPaul1832316.0DavisUniversity of California-Los AngelesIRVINEANALYST, ADMINISTRATIVE, ASSTpauldavis
3825254.0LOS ANGELESPaul1832352.0DavisUniversity of California-Los AngelesIRVINEHOSPITAL UNIT SERV COORD IIpauldavis
4825254.0LOS ANGELESPaul1800777.0DavisUniversity of California-Los AngelesDAVISRESEARCH _____ - FISCAL YEARpauldavis
5825254.0LOS ANGELESPaul1868502.0DavisUniversity of California-Los AngelesLOS ANGELESPROGRAMMER/ANALYST IV - SUPERVpauldavis
6825254.0LOS ANGELESPaul1868593.0DavisUniversity of California-Los AngelesLOS ANGELESPROFESSOR - ACADEMIC YEARpauldavis
7833340.0SANTA CRUZGretchen1999814.0AndreasenUniversity of California-Santa CruzSANTA CRUZADMIN/COORD/OFFICER(FUNC AREA)gretchenandreasen
8831132.0La JollaJoseph1887173.0PasqualeUniversity of California-San DiegoLOS ANGELESREPRESENTATIVE, ACCESS, PRINjosephpasquale
9831132.0La JollaJoseph1942864.0PasqualeUniversity of California-San DiegoSAN DIEGOPROFESSOR-ACAD YR-BUS/ECON/ENGjosephpasquale
10836152.0MENLO PARKMary1774629.0McCreadySRI InternationalBERKELEYACADEMIC COORD II-ACADEMIC YRmarymccready
11836152.0MENLO PARKMary1837591.0McCreadySRI InternationalIRVINESENIOR FINANCE ANALYSTmarymccready
12836152.0MENLO PARKMary1944081.0McCreadySRI InternationalSAN DIEGOMANAGER, CASEmarymccready
13832819.0BERKELEYRonald1768965.0FearingUniversity of California-BerkeleyBERKELEYPROFESSOR-ACAD YR-BUS/ECON/ENGronaldfearing
\n", - "
" - ], - "text/plain": [ - " AwardId CityName FirstName ID LastName \\\n", - "0 805989.0 Davis Albert 1814365.0 Schwarz \n", - "1 838258.0 BERKELEY Leo 1765982.0 Blitz \n", - "2 825254.0 LOS ANGELES Paul 1832316.0 Davis \n", - "3 825254.0 LOS ANGELES Paul 1832352.0 Davis \n", - "4 825254.0 LOS ANGELES Paul 1800777.0 Davis \n", - "5 825254.0 LOS ANGELES Paul 1868502.0 Davis \n", - "6 825254.0 LOS ANGELES Paul 1868593.0 Davis \n", - "7 833340.0 SANTA CRUZ Gretchen 1999814.0 Andreasen \n", - "8 831132.0 La Jolla Joseph 1887173.0 Pasquale \n", - "9 831132.0 La Jolla Joseph 1942864.0 Pasquale \n", - "10 836152.0 MENLO PARK Mary 1774629.0 McCready \n", - "11 836152.0 MENLO PARK Mary 1837591.0 McCready \n", - "12 836152.0 MENLO PARK Mary 1944081.0 McCready \n", - "13 832819.0 BERKELEY Ronald 1768965.0 Fearing \n", - "\n", - " Name campus \\\n", - "0 University of California-Davis DAVIS \n", - "1 University of California-Berkeley BERKELEY \n", - "2 University of California-Los Angeles IRVINE \n", - "3 University of California-Los Angeles IRVINE \n", - "4 University of California-Los Angeles DAVIS \n", - "5 University of California-Los Angeles LOS ANGELES \n", - "6 University of California-Los Angeles LOS ANGELES \n", - "7 University of California-Santa Cruz SANTA CRUZ \n", - "8 University of California-San Diego LOS ANGELES \n", - "9 University of California-San Diego SAN DIEGO \n", - "10 SRI International BERKELEY \n", - "11 SRI International IRVINE \n", - "12 SRI International SAN DIEGO \n", - "13 University of California-Berkeley BERKELEY \n", - "\n", - " title first last \n", - "0 PROFESSOR - ACADEMIC YEAR albert schwarz \n", - "1 PROFESSOR - ACADEMIC YEAR leo blitz \n", - "2 ANALYST, ADMINISTRATIVE, ASST paul davis \n", - "3 HOSPITAL UNIT SERV COORD II paul davis \n", - "4 RESEARCH _____ - FISCAL YEAR paul davis \n", - "5 PROGRAMMER/ANALYST IV - SUPERV paul davis \n", - "6 PROFESSOR - ACADEMIC YEAR paul davis \n", - "7 ADMIN/COORD/OFFICER(FUNC AREA) gretchen andreasen 
\n", - "8 REPRESENTATIVE, ACCESS, PRIN joseph pasquale \n", - "9 PROFESSOR-ACAD YR-BUS/ECON/ENG joseph pasquale \n", - "10 ACADEMIC COORD II-ACADEMIC YR mary mccready \n", - "11 SENIOR FINANCE ANALYST mary mccready \n", - "12 MANAGER, CASE mary mccready \n", - "13 PROFESSOR-ACAD YR-BUS/ECON/ENG ronald fearing " - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "sel_col = ['AwardId', 'CityName', 'FirstName', 'ID', 'LastName', 'Name', 'campus', 'title', 'first', 'last']\n", "df_linked_data[sel_col]"