General Practitioners in Parisian départements

[1]:
from pynsee.sirene import get_dimension_list, search_sirene
from pynsee.metadata import get_activity_list
from pynsee.geodata import get_geodata_list, get_geodata
from pynsee.localdata import get_geo_list

import difflib
import geopandas as gpd
import numpy
import pandas as pd
from pandas.api.types import CategoricalDtype
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import descartes
import re
from shapely.geometry import Polygon, MultiPolygon
from matplotlib.offsetbox import AnchoredText
[2]:
import logging
import sys
logging.basicConfig(stream=sys.stdout,
                    level=logging.INFO,
                    format="%(message)s")
[4]:
get_dimension_list().to_csv("sirene_dimensions.csv")
Existing environment variables used, instead of locally saved credentials
1 - Getting data: 100%|██████████| 1/1 [00:00<00:00, 102.94it/s]
Data saved: /home/onyxia/.cache/pynsee/pynsee/76f10aaaadf5a625061805b71f0bf0db
This function renders package's internal data
This function may return personal data, please check and comply with the legal framework relating to personal data protection !

[5]:
# get the NAF5 activity classification and look at the medical-practice subgroup (86.2x)
naf5 = get_activity_list('NAF5')
naf5.to_csv("naf.csv")
naf5[naf5["NAF5"].str.contains("^86.2")]
[5]:
A10 A129 A17 A21 A38 A5 A64 A88 NAF1 NAF2 NAF3 NAF4 NAF5 TITLE_NAF5_40CH_FR TITLE_NAF5_65CH_FR TITLE_NAF5_FR
664 OQ Q86Z OQ Q QA OQ QA0 86 Q 86 86.2 86.21 86.21Z Activité des médecins généralistes Activité des médecins généralistes Activité des médecins généralistes
665 OQ Q86Z OQ Q QA OQ QA0 86 Q 86 86.2 86.22 86.22A Act. radiodiagnostic et de radiothérapie Activités de radiodiagnostic et de radiothérapie Activités de radiodiagnostic et de radiothérapie
666 OQ Q86Z OQ Q QA OQ QA0 86 Q 86 86.2 86.22 86.22B Activités chirurgicales Activités chirurgicales Activités chirurgicales
667 OQ Q86Z OQ Q QA OQ QA0 86 Q 86 86.2 86.22 86.22C Autre activité des médecins spécialistes Autres activités des médecins spécialistes Autres activités des médecins spécialistes
668 OQ Q86Z OQ Q QA OQ QA0 86 Q 86 86.2 86.23 86.23Z Pratique dentaire Pratique dentaire Pratique dentaire
[6]:
# departements of the inner ring of Île-de-France (Paris and the petite couronne)
list_dep = ["92", "93", "75", "94"] # + ["77", "78", "95", "91"] to cover the whole region
string_dep = "|".join([dep + "*" for dep in list_dep])
string_dep
[6]:
'92*|93*|75*|94*'
[7]:
# search SIRENE for establishments whose main activity is general medical practice
# (NAF code 86.21Z, "médecins généralistes") in the selected departements

doctors = search_sirene(variable = ["activitePrincipaleEtablissement", "codePostalEtablissement"],
                        pattern = ['86.21Z*', string_dep],
                        number = 100000)

# keep only establishments actually classified under 86.21Z
doctors = doctors[doctors["activitePrincipaleEtablissement"].str.contains("^86.21Z")].reset_index(drop=True)
1 - Getting data: 100%|██████████| 1000/1000 [00:04<00:00, 212.20it/s]
...
29/100 - Getting data: 100%|██████████| 175/175 [00:00<00:00, 255.57it/s]
Data saved: /home/onyxia/.cache/pynsee/pynsee/6c0f248de2346ddc6cb0d424b41d8f40
[8]:
# keywords flagging establishments that belong to other health activities
# (radiology, dentistry, hospitals, municipal services, ...)
list_pattern = ["RADIOLOGIE", "IMAGERIE", "GYNECO", "DENTAIRE", "ANESTHES", 'P.M.I', 'INFANTILE', 'CARDIOLOG',
                "MAXILLO", "HOSPITALIER", "OPHTALMO", "DERMATOLOG", 'PLANIFICATION', 'MAIRIE', 'THORACIQUE',
                "TOMODENSITOM", "URGENCE", "NEPHROLOG", "PRAXEA", 'VACCINATION', 'SCOLAIRE', 'CANCER', 'STOMATOLO',
                "URODYNAMIQ", "CARDIORIS", "CHIRPLASTIQUE", "NINAMAX", "CICOF", 'MATERNELLE', 'DIALYSE',
                'RADIOTHERAPIE', 'RHUMATOLOG', 'DENTISTE', 'ECHOGRAPH', 'PATHOLOG', 'CHIRURGIE', 'ESTHETIQ',
                'CYTOLOG', 'ORTHOPED']

pattern = r"|".join(list_pattern)
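As an illustration only (made-up names, not SIRENE data), the joined pattern acts as a case-sensitive regex alternation: any name containing one of the keywords is dropped by the next cell. A minimal sketch:

import pandas as pd

names = pd.Series(["CABINET MEDICAL DUPONT", "CENTRE DE RADIOLOGIE NORD", "SELARL IMAGERIE DU SUD"])
demo_pattern = r"|".join(["RADIOLOGIE", "IMAGERIE"])

# ~str.contains(...) keeps only the names without any flagged keyword
names[~names.str.contains(demo_pattern, regex=True, na=False)]
# 0    CABINET MEDICAL DUPONT
# dtype: object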
[9]:
# drop establishments whose name points to a health activity other than general practice
doctors = doctors[~doctors["denominationUniteLegale"].str.contains(pattern, regex=True, na=False)].reset_index(drop=True)
doctors = doctors[~doctors["enseigne1Etablissement"].str.contains(pattern, regex=True, na=False)].reset_index(drop=True)

# exclude associations and administrations (legal category codes 9220 and 7220)
doctors = doctors[~doctors["categorieJuridiqueUniteLegale"].isin(["9220", "7220"])].reset_index(drop=True)
doctors = doctors.sort_values(["categorieJuridiqueUniteLegale"])

doctors.to_csv("doctors.csv")
list_denomination_uniteLegale = doctors.denominationUniteLegale.unique()
[10]:
# get geographical data list
geodata_list = get_geodata_list()

# get communes geographical limits
com = get_geodata('ADMINEXPRESS-COG-CARTO.LATEST:commune')

# get arrondissement (subdivision of departement) geographical limits
arr = get_geodata('ADMINEXPRESS-COG-CARTO.LATEST:arrondissement')
Previously saved data has been used:
/home/onyxia/.cache/pynsee/pynsee/2ac583be54e866b2b1b30fb0113c1fd0
Set update=True to get the most up-to-date data
100%|██████████| 99/99 [00:33<00:00,  2.97it/s]
Data saved: /home/onyxia/.cache/pynsee/pynsee/a4f589755961ab3aa6bdff99605334d1
Data saved: /home/onyxia/.cache/pynsee/pynsee/1b27559c061e4bdaa0bd2966f4605287
[11]:
# get the geographical limits of Paris municipal arrondissements
arrmun = get_geodata('ADMINEXPRESS-COG-CARTO.LATEST:arrondissement_municipal')
arrmun = arrmun[arrmun.insee_com.str.contains('^75')].reset_index(drop=True)
# turn "PARIS 1ER ARRONDISSEMENT", "PARIS 2E ARRONDISSEMENT", ... into "PARIS 1", "PARIS 2", ...
arrmun["nom_m"] = arrmun["nom_m"].apply(lambda x: re.sub(r"ER|E", "", re.sub(" ARRONDISSEMENT", "", x)))
arrmun = arrmun.rename(columns={"geometry" : "geometry2",
                                "population" : "population2",
                               "nom_m" : "nom_m2"})
arrmun = arrmun[["nom_m2", "geometry2", "population2"]]
Data saved: /home/onyxia/.cache/pynsee/pynsee/94d0586636b45c9d8cdd466da4a3f3b8
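As a quick check of the renaming above (purely illustrative), the nested re.sub calls turn the official labels into the short form used later to join with the SIRENE commune labels:

import re

re.sub(r"ER|E", "", re.sub(" ARRONDISSEMENT", "", "PARIS 1ER ARRONDISSEMENT"))
# -> 'PARIS 1'
re.sub(r"ER|E", "", re.sub(" ARRONDISSEMENT", "", "PARIS 20E ARRONDISSEMENT"))
# -> 'PARIS 20'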
[12]:
def _replace_nan(x, y):
    # return y when x is missing (None or NaN), otherwise return x
    if (x is None) or (numpy.isnan(x)):
        return y
    else:
        return x
[13]:
doctors
[13]:
siren nic siret dateDebut dateCreationEtablissement dateCreationUniteLegale dateFin denominationUniteLegale nomUniteLegale prenomUsuelUniteLegale ... libellePaysEtrangerEtablissement complementAdresse2Etablissement indiceRepetition2Etablissement libelleCommuneEtranger2Etablissement distributionSpeciale2Etablissement codeCedex2Etablissement libelleCedex2Etablissement codePaysEtranger2Etablissement libellePaysEtranger2Etablissement enseigne3Etablissement
5267 445161631 00038 44516163100038 2014-11-01 2014-11-01 2002-12-31 None None BACH DANIEL ... None None None None None None None None None None
6477 518143185 00016 51814318500016 2009-11-02 2009-11-02 2009-11-02 None None RICCI AXEL ... None None None None None None None None None None
6475 518032172 00026 51803217200026 2013-06-01 2013-06-01 2009-05-02 None None CHENIOUR SOUMAYA ... None None None None None None None None None None
6474 518023916 00027 51802391600027 2012-01-02 2012-01-02 2009-10-31 None None BIBAS AURORE ... None None None None None None None None None None
6473 518001169 00029 51800116900029 2015-12-02 2015-12-02 2009-06-05 None None SIDHOUM ANNAB SAMIRA ... None None None None None None None None None None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6916 775665805 00468 77566580500468 2008-01-01 1996-12-25 1900-01-01 None COMPAGNIE FILLES CHARITE ST VINCENT PAUL None None ... None None None None None None None None None None
6928 775683006 00362 77568300600362 2017-06-21 2017-06-21 1900-01-01 None FONDATION SANTE DES ETUDIANTS DE FRANCE None None ... None None None None None None None None None None
6915 775663883 00020 77566388300020 2008-01-01 1900-01-01 1900-01-01 None INSTITUT ARTHUR VERNES None None ... None None None None None None None None None None
292 313711038 00015 31371103800015 2008-01-01 1978-01-01 1978-01-01 None FONDATION HAHNEMANN None None ... None None None None None None None None None None
221 310227749 00010 31022774900010 2008-01-01 1900-01-01 1900-01-01 None CENTRE DE SOINS None None ... None None None None None None None None None None

10536 rows × 91 columns

[14]:
# compute the average establishment headcount by legal status (categorieJuridiqueUniteLegale)
hdLegalStatus = doctors.groupby(['categorieJuridiqueUniteLegale'], dropna=True)['effectifsMinEtablissement'].agg(['mean', "count"])

# default to a headcount of 1 when no figure is available for a legal status
hdLegalStatus["meanHeadcountLegalStatus"] = hdLegalStatus["mean"].apply(lambda x: _replace_nan(x, 1))

hdLegalStatus["categorieJuridiqueUniteLegale"] = hdLegalStatus.index
# sole proprietorships (legal category 1000) count as one practitioner
hdLegalStatus.loc[hdLegalStatus["categorieJuridiqueUniteLegale"] == "1000", "meanHeadcountLegalStatus"] = 1
hdLegalStatus = hdLegalStatus.reset_index(drop=True)

doctors = doctors.merge(hdLegalStatus, how = "left", on = "categorieJuridiqueUniteLegale").reset_index(drop=True)

# use the establishment headcount when available, otherwise fall back to the legal-status average
for r in doctors.index:
    hdMin = doctors.loc[r, "effectifsMinEtablissement"]
    if (hdMin is None) or (numpy.isnan(hdMin)):
        doctors.loc[r, "headcount"] = doctors.loc[r, "meanHeadcountLegalStatus"]
    else:
        doctors.loc[r, "headcount"] = hdMin
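The loop above could also be written in vectorized form with fillna, which aligns on the index (a sketch, assuming missing headcounts are stored as NaN or None, as in the loop):

# equivalent vectorized fallback: establishment headcount, else legal-status average
doctors["headcount"] = doctors["effectifsMinEtablissement"].fillna(doctors["meanHeadcountLegalStatus"])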
[15]:
# duplicates cleaning
# normalization to avoid double counting: establishments sharing a siren are rescaled so that their headcounts sum to one
doctors["headcountFinal"] = doctors["headcount"] / doctors.groupby("siren", dropna=True)["headcount"].transform("sum")

# export duplicated siren records for a visual check
dupDoctors = doctors[doctors["siren"].duplicated(keep=False)].reset_index(drop=True)
dupDoctors.to_csv("dupDoctors.csv")

list_adress_var = ["numeroVoieEtablissement", "typeVoieEtablissementLibelle",
                   "libelleVoieEtablissement", "codePostalEtablissement", "libelleCommuneEtablissement"]

addressDupDoctors = doctors[doctors[list_adress_var].duplicated(keep=False)].reset_index(drop=True)
addressDupDoctors = addressDupDoctors.sort_values(list_adress_var)
addressDupDoctors.to_csv("addressDupDoctors.csv")
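To make the normalization above concrete, here is a minimal sketch on made-up data (not taken from SIRENE): a siren with two establishments ends up contributing one unit in total, split in proportion to the establishment headcounts.

import pandas as pd

demo = pd.DataFrame({"siren": ["A", "A", "B"], "headcount": [1.0, 3.0, 2.0]})
demo["headcountFinal"] = demo["headcount"] / demo.groupby("siren")["headcount"].transform("sum")
demo
#   siren  headcount  headcountFinal
# 0     A        1.0            0.25
# 1     A        3.0            0.75
# 2     B        2.0            1.00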
[16]:
doctors.sort_values(["libelleCommuneEtablissement"]).to_csv("doctorSortedCom.csv")
[17]:
# sum practitioner headcounts by commune and postal code
doctorsCom = doctors.groupby(['libelleCommuneEtablissement', 'codePostalEtablissement'])['headcountFinal'].agg(['sum'])
doctorsCom = doctorsCom.rename(columns = {"sum" : "headcount"})
doctorsCom
[17]:
headcount
libelleCommuneEtablissement codePostalEtablissement
ABLON-SUR-SEINE 94480 2.000000
ALFORTVILLE 94140 31.250000
ANTONY 92160 144.601926
ARCUEIL 94110 15.500000
ASNIERES-SUR-SEINE 92600 63.750000
... ... ...
VILLEPINTE 93420 24.000000
VILLETANEUSE 93430 4.000000
VILLIERS-SUR-MARNE 94350 15.000000
VINCENNES 94300 76.500000
VITRY-SUR-SEINE 94400 86.250000

147 rows × 1 columns

[18]:
doctorsCom["libelleCommuneEtablissement"] = doctorsCom.index.get_level_values("libelleCommuneEtablissement")
doctorsCom["codePostalEtablissement"] = doctorsCom.index.get_level_values("codePostalEtablissement")
doctorsCom = doctorsCom.reset_index(drop=True)
[19]:
communes = get_geo_list("communes")

def _find_commune_identifier(com, dep, communes=communes):
    # fuzzy-match a SIRENE commune label within a departement and return its official INSEE code
    try:
        # restrict the search to the relevant departement before fuzzy matching
        communes_dep = communes[communes["CODE_DEP"] == dep].reset_index(drop=True)
        communes_dep["TITLE"] = communes_dep["TITLE"].str.upper()

        match = difflib.get_close_matches(com.upper(), communes_dep["TITLE"].to_list())[0]

        cog = communes_dep.loc[communes_dep["TITLE"] == match, "CODE"].iloc[0]
    except Exception:
        cog = None

    return cog

# attach the INSEE commune code to each row
for r in doctorsCom.index:
    dep = doctorsCom.loc[r, "codePostalEtablissement"][:2]
    comName = doctorsCom.loc[r, "libelleCommuneEtablissement"]
    doctorsCom.loc[r, "insee_com"] = _find_commune_identifier(comName, dep)
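For instance, an illustrative call (the exact code returned depends on the commune list vintage delivered by get_geo_list):

_find_commune_identifier("ASNIERES SUR SEINE", "92")
# expected to return '92004', the INSEE code of Asnières-sur-Seine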

[20]:
# keep only communes of the selected departements and attach the practitioner counts
comDep = com[com["insee_dep"].isin(list_dep)].reset_index(drop=True)
doctorsCom2 = comDep.merge(doctorsCom, how = "left", on = "insee_com")
[21]:
# communes with no matched practitioner get a headcount of 0
doctorsCom2["headcount"] = doctorsCom2["headcount"].apply(lambda x: _replace_nan(x, 0))
[22]:
# number of practitioners per 1000 inhabitants
doctorsCom2["ratio"] = 1000 * doctorsCom2["headcount"] / doctorsCom2["population"]
[23]:
doctorsCom2 = doctorsCom2.reset_index(drop=True)

# for Paris, SIRENE labels establishments by arrondissement ("PARIS 1", ..., "PARIS 20"):
# use the arrondissement geometry and population instead of the city-wide ones
doctorsCom4 = doctorsCom2.merge(arrmun, left_on="libelleCommuneEtablissement", right_on="nom_m2", how="left")
for i in doctorsCom4.index:
    if type(doctorsCom4.loc[i, 'geometry2']) in [MultiPolygon, Polygon]:
        doctorsCom4.loc[i, 'geometry'] = doctorsCom4.loc[i, 'geometry2']
        doctorsCom4.loc[i, "ratio"] = 1000 * doctorsCom4.loc[i, "headcount"] / doctorsCom4.loc[i, "population2"]

del doctorsCom4["geometry2"]
doctorsCom2Map = gpd.GeoDataFrame(doctorsCom4)
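As a small, optional variant (an assumption, not required by the original code), the active geometry column can be declared explicitly when building the GeoDataFrame, which makes the intent clearer:

doctorsCom2Map = gpd.GeoDataFrame(doctorsCom4, geometry="geometry")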
[25]:
# bin the GP density (per 1000 inhabitants) into ordered ranges
doctorsCom2Map.loc[doctorsCom2Map.ratio < 0.2, 'range'] = "< 0.2"
doctorsCom2Map.loc[doctorsCom2Map.ratio >= 5, 'range'] = "> 5"

density_ranges = [0.2, 0.6, 1, 1.5, 2, 2.5, 3, 3.5, 5]
list_ranges = []
list_ranges.append("< 0.2")

for i in range(len(density_ranges)-1):
    min_range = density_ranges[i]
    max_range = density_ranges[i+1]
    range_string = "[{}, {}[".format(min_range, max_range)
    rows = (doctorsCom2Map.ratio >= min_range) & (doctorsCom2Map.ratio < max_range)
    doctorsCom2Map.loc[rows, 'range'] = range_string
    list_ranges.append(range_string)

list_ranges.append("> 5")

# ordered categorical so that the legend lists the ranges in increasing order
doctorsCom2Map['range2'] = doctorsCom2Map['range'].astype(CategoricalDtype(categories=list_ranges, ordered=True))
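The same binning could be expressed more compactly with pandas.cut (a sketch, assuming the same bin edges and labels as above; the result is stored in a separate variable so the cell above is left untouched):

import numpy as np
import pandas as pd

bins = [-np.inf, 0.2, 0.6, 1, 1.5, 2, 2.5, 3, 3.5, 5, np.inf]
labels = ["< 0.2"] + ["[{}, {}[".format(bins[i], bins[i+1]) for i in range(1, len(bins) - 2)] + ["> 5"]

# ordered categorical equivalent to the 'range2' column built above
range2_alt = pd.cut(doctorsCom2Map.ratio, bins=bins, labels=labels, right=False)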
[26]:
# make the plot
txt = 'In some cities the figures may be biased: several GPs can share one office without all working there every day.'
txt += '\nConsequently, this plot should not be considered as official statistics.'

fig, ax = plt.subplots(1,1,figsize=[10,10])
doctorsCom2Map.plot(column='range2', cmap=cm.viridis,
    legend=True, ax=ax,
    legend_kwds={'bbox_to_anchor': (1.1, 0.8)})
ax.set_axis_off()
ax.set(title='Distribution of doctors in Parisian départements\nunit: 1 doctor / 1000 inhabitants')
at = AnchoredText(txt, prop=dict(size=9), frameon=True, loc=4)
ax.add_artist(at)
plt.show()
../_images/examples_example_doctors_idf_25_0.png
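To keep a copy of the map alongside the CSV files written earlier, the figure could also be saved to disk just before plt.show() (the file name below is arbitrary):

fig.savefig("doctors_idf_map.png", dpi=150, bbox_inches="tight")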
[ ]: