General Practitioners in parisian departements

[33]:
from pynsee.sirene import get_dimension_list, search_sirene
from pynsee.metadata import get_activity_list
from pynsee.geodata import get_geodata_list, get_geodata
from pynsee.localdata import get_geo_list

import difflib
import geopandas as gpd
import numpy
import pandas as pd
from pandas.api.types import CategoricalDtype
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import descartes
import re
from shapely.geometry import Polygon, MultiPolygon
from matplotlib.offsetbox import AnchoredText
[34]:
import logging
import sys
logging.basicConfig(stream=sys.stdout,
                    level=logging.INFO,
                    format="%(message)s")
[35]:
get_dimension_list().to_csv("sirene_dimensions.csv")
Previously saved data has been used:
/home/tfardet/.cache/pynsee/pynsee/3e054427d1b13bfc770c9c256b2dbe46.parquet
Creation date: 2025-02-07, today
Set update=True to get the most up-to-date data
[36]:
# get activity list
naf5 = get_activity_list('NAF5')
naf5.to_csv("naf.csv")
naf5[naf5["NAF5"].str.contains("^86.2")]
[36]:
A10 A129 A17 A21 A38 A5 A64 A88 NAF1 NAF2 NAF3 NAF4 NAF5 TITLE_NAF5_40CH_FR TITLE_NAF5_65CH_FR TITLE_NAF5_FR
664 OQ Q86Z OQ Q QA OQ QA0 86 Q 86 86.2 86.21 86.21Z Activité des médecins généralistes Activité des médecins généralistes Activité des médecins généralistes
665 OQ Q86Z OQ Q QA OQ QA0 86 Q 86 86.2 86.22 86.22A Act. radiodiagnostic et de radiothérapie Activités de radiodiagnostic et de radiothérapie Activités de radiodiagnostic et de radiothérapie
666 OQ Q86Z OQ Q QA OQ QA0 86 Q 86 86.2 86.22 86.22B Activités chirurgicales Activités chirurgicales Activités chirurgicales
667 OQ Q86Z OQ Q QA OQ QA0 86 Q 86 86.2 86.22 86.22C Autre activité des médecins spécialistes Autres activités des médecins spécialistes Autres activités des médecins spécialistes
668 OQ Q86Z OQ Q QA OQ QA0 86 Q 86 86.2 86.23 86.23Z Pratique dentaire Pratique dentaire Pratique dentaire
[37]:
# departements list of ile de france
list_dep = ["92", "93", "75", "94"] # + ["77", "78", "95", "91"]
string_dep = "|".join([dep + "*" for dep in list_dep])
string_dep
[37]:
'92*|93*|75*|94*'
[58]:
# search data on doctors

doctors = search_sirene(variable = ["activitePrincipaleEtablissement", "codePostalEtablissement"],
                        pattern = ['86.21Z*', string_dep],
                        number = 100000)
 # '86.21Z' médecin généraliste

doctors = doctors[doctors["activitePrincipaleEtablissement"].str.contains("^86.21Z")].reset_index(drop=True)
Previously saved data has been used:
/home/tfardet/.cache/pynsee/pynsee/78cebe2f46279956f9948c8938f24c53.parquet
Creation date: 2025-02-07, today
Set update=True to get the most up-to-date data
[59]:
# filter
list_pattern = ["RADIOLOGIE", "IMAGERIE", "GYNECO", "DENTAIRE", "ANESTHES", 'P.M.I', 'INFANTILE', 'CARDIOLOG',
                "MAXILLO", "HOSPITALIER", "OPHTALMO", "DERMATOLOG", 'PLANIFICATION', 'MAIRIE', 'THORACIQUE',
                "TOMODENSITOM", "URGENCE", "NEPHROLOG", "PRAXEA", 'VACCINATION', 'SCOLAIRE', 'CANCER', 'STOMATOLO',
                "URODYNAMIQ", "CARDIORIS", "CHIRPLASTIQUE", "NINAMAX", "CICOF", 'MATERNELLE', 'DIALYSE',
                'RADIOTHERAPIE', 'RHUMATOLOG', 'DENTISTE', 'ECHOGRAPH', 'PATHOLOG', 'CHIRURGIE', 'ESTHETIQ',
                'CYTOLOG', 'ORTHOPED']

pattern = r"|".join(list_pattern)
[60]:
# clean up the data not related to general practitioners but to other health activities
doctors = doctors[~doctors["denominationUniteLegale"].str.contains(pattern, regex=True, na=False)].reset_index(drop=True)
doctors = doctors[~doctors["enseigne1Etablissement"].str.contains(pattern, regex=True, na=False)].reset_index(drop=True)

# exclude association and administration
doctors = doctors[~doctors["categorieJuridiqueUniteLegale"].isin(["9220", "7220"])].reset_index(drop=True)
doctors = doctors.sort_values(["categorieJuridiqueUniteLegale"])

doctors.to_csv("doctors.csv")
list_denomination_uniteLegale = doctors.denominationUniteLegale.unique()
[ ]:
# get geographical data list
geodata_list = get_geodata_list()

# get communes geographical limits
com = get_geodata('ADMINEXPRESS-COG-CARTO.LATEST:commune')

# get arrondissement (subdivision of departement) geographical limits
arr = get_geodata('ADMINEXPRESS-COG-CARTO.LATEST:arrondissement')
Previously saved data has been used:
/home/tfardet/.cache/pynsee/pynsee/e879822687bab4b12f23762b5d56742b.parquet
Creation date: 2025-02-07, today
Set update=True to get the most up-to-date data
[42]:
# get arrondissement municipal geographical limits
arrmun = get_geodata('ADMINEXPRESS-COG-CARTO.LATEST:arrondissement_municipal')
arrmun = arrmun[arrmun.insee_com.str.contains('^75')].reset_index(drop=True)
arrmun["nom_m"] = arrmun["nom_m"].apply(lambda x: re.sub(r"ER|E", "", re.sub(" ARRONDISSEMENT", "", x)))
arrmun = arrmun.rename(columns={"geometry" : "geometry2",
                                "population" : "population2",
                               "nom_m" : "nom_m2"})
arrmun = arrmun[["nom_m2", "geometry2", "population2"]]
Previously saved data has been used:
/home/tfardet/.cache/pynsee/pynsee/25e148c283d20f478d44c0ed19a75b73.parquet
Creation date: 2025-02-07, today
Set update=True to get the most up-to-date data
[43]:
def _replace_nan(x, y):
    if (x is None) or (numpy.isnan(x)):
        return y
    else:
        return x
[44]:
doctors
[44]:
siren nic siret dateDebut dateCreationEtablissement dateCreationUniteLegale dateFin denominationUniteLegale nomUniteLegale prenomUsuelUniteLegale ... codePostal2Etablissement libelleCommune2Etablissement libelleCommuneEtranger2Etablissement distributionSpeciale2Etablissement codeCommune2Etablissement codeCedex2Etablissement libelleCedex2Etablissement codePaysEtranger2Etablissement libellePaysEtranger2Etablissement enseigne3Etablissement
9754 934302480 00017 93430248000017 2024-11-01 2024-11-01 2024-11-01 None None MAKNI SAMIA ... None None None None None None None None None None
5509 494498017 00040 49449801700040 2021-01-04 2021-01-04 2006-01-17 None None HERAULT NATHALIE ... None None None None None None None None None None
5508 494422066 00030 49442206600030 2014-07-07 2014-07-07 2006-06-09 None None VENTRE MARIE ... None None None None None None None None None None
5507 494390701 00022 49439070100022 2015-04-01 2015-04-01 2007-01-01 None None KARRAZ WAEL ... None None None None None None None None None None
5506 494260391 00045 49426039100045 2019-11-28 2019-11-28 2006-11-09 None None BOTHNER LAURENT ... None None None None None None None None None None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6262 775665805 00468 77566580500468 2008-01-01 1996-12-25 1900-01-01 None COMPAGNIE FILLES CHARITE ST VINCENT PAUL None None ... None None None None None None None None None None
6274 775683006 00362 77568300600362 2017-06-21 2017-06-21 1900-01-01 None FONDATION SANTE DES ETUDIANTS DE FRANCE None None ... None None None None None None None None None None
6261 775663883 00020 77566388300020 2008-01-01 1900-01-01 1900-01-01 None INSTITUT ARTHUR VERNES None None ... None None None None None None None None None None
268 313711038 00015 31371103800015 2008-01-01 1978-01-01 1978-01-01 None FONDATION HAHNEMANN None None ... None None None None None None None None None None
206 310227749 00010 31022774900010 2008-01-01 1900-01-01 1900-01-01 None CENTRE DE SOINS None None ... None None None None None None None None None None

10043 rows × 96 columns

[45]:
# count general practioners by commun
hdLegalStatus = doctors.groupby(['categorieJuridiqueUniteLegale'], dropna=True)['effectifsMinEtablissement'].agg(['mean', "count"])

hdLegalStatus["meanHeadcountLegalStatus"] = hdLegalStatus["mean"].apply(lambda x: _replace_nan(x, 1))

hdLegalStatus["categorieJuridiqueUniteLegale"] = hdLegalStatus.index
hdLegalStatus.loc[hdLegalStatus["categorieJuridiqueUniteLegale"] == "1000", "meanHeadcountLegalStatus"] = 1
hdLegalStatus = hdLegalStatus.reset_index(drop=True)

doctors = doctors.merge(hdLegalStatus, how = "left", on = "categorieJuridiqueUniteLegale").reset_index(drop=True)

for r in doctors.index:
    hdMin = doctors.loc[r, "effectifsMinEtablissement"]
    if (hdMin is None) or (numpy.isnan(hdMin)):
        doctors.loc[r, "headcount"] = doctors.loc[r, "meanHeadcountLegalStatus"]
    else:
        doctors.loc[r, "headcount"] = hdMin
[46]:
# duplicates cleaning
# normalization to avoid double counting
doctors["headcountFinal"] = doctors["headcount"] / doctors.groupby("siren", dropna=True)["headcount"].transform("sum")

#visual check duplicates
dupDoctors = doctors[doctors["siren"].duplicated(keep=False)].reset_index(drop=True)
dupDoctors.to_csv("dupDoctors.csv")

list_adress_var = ["numeroVoieEtablissement", "typeVoieEtablissementLibelle",
                   "libelleVoieEtablissement", "codePostalEtablissement", "libelleCommuneEtablissement"]

addressDupDoctors = doctors[doctors[list_adress_var].duplicated(keep=False)].reset_index(drop=True)
addressDupDoctors = addressDupDoctors.sort_values(list_adress_var)
addressDupDoctors.to_csv("addressDupDoctors.csv")
[47]:
doctors.sort_values(["libelleCommuneEtablissement"]).to_csv("doctorSortedCom.csv")
[48]:
# count general practioners by commune
doctorsCom = doctors.groupby(['libelleCommuneEtablissement', 'codePostalEtablissement'])['headcountFinal'].agg(['sum'])
doctorsCom = doctorsCom.rename(columns = {"sum" : "headcount"})
doctorsCom
[48]:
headcount
libelleCommuneEtablissement codePostalEtablissement
ABLON-SUR-SEINE 94480 2.000000
ALFORTVILLE 94140 29.250000
ANTONY 92160 131.406046
ARCUEIL 94110 14.500000
ASNIERES-SUR-SEINE 92600 67.629197
... ... ...
VILLETANEUSE 93430 5.000000
VILLIERS SUR MARNE 94350 1.000000
VILLIERS-SUR-MARNE 94350 13.250000
VINCENNES 94300 70.640814
VITRY-SUR-SEINE 94400 79.257402

189 rows × 1 columns

[49]:
doctorsCom["libelleCommuneEtablissement"] = doctorsCom.index.get_level_values("libelleCommuneEtablissement")
doctorsCom["codePostalEtablissement"] = doctorsCom.index.get_level_values("codePostalEtablissement")
doctorsCom = doctorsCom.reset_index(drop=True)
[50]:
import os, sys

communes = get_geo_list("communes")

def _find_commune_identifier(com, dep, communes=communes):

    try:
        list_com = [c.upper() for c in communes.TITLE.to_list()]
        communes["TITLE"] = list_com
        communes = communes[communes["CODE_DEP"] == dep].reset_index(drop=True)

        match = difflib.get_close_matches(com.upper(), list_com)[0]

        cog = communes[communes["TITLE"] == match].reset_index(drop=True)
        cog = cog.loc[0, "CODE"]
    except:
        cog = None

    return cog

for r in doctorsCom.index:
    dep = doctorsCom.loc[r, "codePostalEtablissement"][:2]
    comName = doctorsCom.loc[r, "libelleCommuneEtablissement"]
    doctorsCom.loc[r, "insee_com"] = _find_commune_identifier(comName, dep)
Previously saved data has been used:
/home/tfardet/.cache/pynsee/pynsee/9807b8fe5399f855c1c5cf4d46ae21a7.parquet
Creation date: 2025-02-07, today
Set update=True to get the most up-to-date data
[51]:
comDep = com[com["insee_dep" ].isin(list_dep)].reset_index(drop=True)
doctorsCom2 = comDep.merge(doctorsCom, how = "left", on = "insee_com")
[52]:
doctorsCom2["headcount"] = doctorsCom2["headcount"].apply(lambda x: _replace_nan(x, 0))
[53]:
doctorsCom2["headcount"] = doctorsCom2["headcount"].apply(lambda x: _replace_nan(x, 0))
doctorsCom2["ratio"] = 1000 * doctorsCom2["headcount"] / doctorsCom2["population"]
[54]:
doctorsCom2 = doctorsCom2.reset_index(drop=True)

doctorsCom4 = doctorsCom2[doctorsCom2.columns].merge(arrmun, left_on ="libelleCommuneEtablissement", right_on="nom_m2", how="left")
for i in doctorsCom4.index:
    if type(doctorsCom4.loc[i, 'geometry2']) in [MultiPolygon, Polygon]:
        doctorsCom4.loc[i, 'geometry'] = doctorsCom4.loc[i, 'geometry2']
        doctorsCom4.loc[i, "ratio"] = 1000 * doctorsCom4.loc[i,"headcount"] / doctorsCom4.loc[i,"population2"]

del doctorsCom4["geometry2"]
doctorsCom2Map = gpd.GeoDataFrame(doctorsCom4)
[55]:
doctorsCom2Map = gpd.GeoDataFrame(doctorsCom4)
[56]:
# compute density ranges of GP
doctorsCom2Map.loc[doctorsCom2Map.ratio < 1, 'range'] = "< 0.2"
doctorsCom2Map.loc[doctorsCom2Map.ratio >= 5, 'range'] = "> 5"

density_ranges = [0.2, 0.6, 1, 1.5, 2, 2.5, 3, 3.5, 5]
list_ranges = []
list_ranges.append( "< 0.2")

for i in range(len(density_ranges)-1):
    min_range = density_ranges[i]
    max_range = density_ranges[i+1]
    range_string = "[{}, {}[".format(min_range, max_range)
    rows = (doctorsCom2Map.ratio >= min_range) & (doctorsCom2Map.ratio < max_range)
    doctorsCom2Map.loc[rows, 'range'] = range_string
    list_ranges.append(range_string)

list_ranges.append("> 5")

doctorsCom2Map['range2'] = doctorsCom2Map['range'].astype(CategoricalDtype(categories=list_ranges, ordered=True))
[26]:
# make plot
txt = 'In some cities, the data can be biased because some GP may team up in one office but they do not attend everyday'
txt += '\nConsequently, you should not considered this plot as official statistics'

fig, ax = plt.subplots(1,1,figsize=[10,10])
doctorsCom2Map.plot(column='range2', cmap=cm.viridis,
    legend=True, ax=ax,
    legend_kwds={'bbox_to_anchor': (1.1, 0.8)})
ax.set_axis_off()
ax.set(title='Distribution of doctors in parisian departements\nunit: 1 doctor/ 1000 inhabitants')
at = AnchoredText(txt, prop=dict(size=9), frameon=True, loc=4)
ax.add_artist(at)
plt.show()
../_images/examples_example_doctors_idf_25_0.png
[ ]: