Commit 758a35c7 authored by Sarah Abrishami

init

parent f6783012
Pipeline #811 canceled with stages
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="DuplicatedCode" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<Languages>
<language minSize="276" name="Python" />
</Languages>
</inspection_tool>
<inspection_tool class="HttpUrlsUsage" enabled="true" level="WEAK WARNING" enabled_by_default="true">
<option name="ignoredUrls">
<list>
<option value="http://localhost" />
<option value="http://127.0.0.1" />
<option value="http://0.0.0.0" />
<option value="http://www.w3.org/2000/svg" />
<option value="http://www.w3.org/2000/10/XMLSchema" />
<option value="http://www.w3.org/2001/XMLSchema" />
<option value="http://www.w3.org/2001/XMLSchema-instance" />
<option value="http://www.w3.org/2001/XInclude" />
<option value="http://www.w3.org/2003/03/wsdl" />
<option value="http://json-schema.org/draft" />
<option value="http://java.sun.com/xml/ns/" />
<option value="http://java.sun.com/jsp/" />
<option value="http://java.sun.com/JSP/" />
<option value="http://java.sun.com/j2ee/dtds/" />
<option value="http://java.sun.com/dtd/" />
<option value="http://xmlns.jcp.org/xml/ns/" />
<option value="http://javafx.com/javafx/" />
<option value="http://javafx.com/fxml" />
<option value="http://maven.apache.org/xsd/" />
<option value="http://maven.apache.org/POM/" />
<option value="http://www.springframework.org/schema/" />
<option value="http://www.springframework.org/tags" />
<option value="http://www.thymeleaf.org" />
<option value="http://www.jboss.org/j2ee/schema/" />
<option value="http://www.jboss.com/xml/ns/" />
<option value="http://www.ibm.com/webservices/xsd" />
<option value="http://activemq.apache.org/schema/" />
<option value="http://schema.cloudfoundry.org/spring/" />
<option value="http://schemas.xmlsoap.org/" />
<option value="http://schemas.android.com/" />
<option value="http://jakarta.apache.org/" />
<option value="http://cxf.apache.org/schemas/" />
<option value="http://10.1.1.198" />
<option value="http://{current_app.config[" />
</list>
</option>
</inspection_tool>
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="11">
<item index="0" class="java.lang.String" itemvalue="openpyxl" />
<item index="1" class="java.lang.String" itemvalue="gunicorn" />
<item index="2" class="java.lang.String" itemvalue="pywin32" />
<item index="3" class="java.lang.String" itemvalue="chardet" />
<item index="4" class="java.lang.String" itemvalue="six" />
<item index="5" class="java.lang.String" itemvalue="certifi" />
<item index="6" class="java.lang.String" itemvalue="numpy" />
<item index="7" class="java.lang.String" itemvalue="pytz" />
<item index="8" class="java.lang.String" itemvalue="requests" />
<item index="9" class="java.lang.String" itemvalue="urllib3" />
<item index="10" class="java.lang.String" itemvalue="pandas" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (sepas)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/cima_dp.iml" filepath="$PROJECT_DIR$/.idea/cima_dp.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
\ No newline at end of file
import pandas as pd
import iarc_check as iarc
import datautils as du
# Load the raw registry export. Identifier-like and date-like columns are kept
# as strings; counters use pandas' nullable integer type.
d = pd.read_csv(
    './inpcr-93-96-data-final.csv',
    dtype={
        'ReportDate': str,
        'BirthDate': str,
        'NationalId': str,
        'Year': pd.Int64Dtype(),
        'Age': pd.Int64Dtype(),
        'IsAudited': pd.Int64Dtype(),
    },
)
fp = './static/mappings.xlsx'

#%% Prepare dates
# Strip the '/' and '-' separators so that only the digits of each date remain.
d['BirthDate'] = d['BirthDate'].str.replace('/', '').str.replace('-', '')
d['ReportDate'] = d['ReportDate'].str.replace('-', '').str.replace('/', '')

#%% Validate and, where possible, repair the ICD-O topography / morphology codes
d['Cfinal'] = iarc.icdo_c_code_validation_correction(d['Cfinal'], verbose=True)
d['Mfinal'] = iarc.icdo_m_code_validation_correction(d['Mfinal'], verbose=True)

#%% Map the diagnosis method title to its code
m = du.get_map_from_excel(fp=fp, sheet_name='diagnosis', s='title', t='code')
d['DiagnosisMethodId'] = du.set_map(d['DiagnosisMethodName'].astype(str), mapping=m)

#%% Correct Sex Codes
# Harmonise the different spellings / numeric codes found in the raw data.
rpldct = {
    'مرد': 'Male',
    'زن': 'Female',
    '1': 'Male',
    '2': 'Female',
    '3': 'Unknown',
    'مرد+E20558': 'Male',
    'تعيين نشده': 'Unspecified',
    'دوجنسي/نامشخص': 'Unknown',
}
d['GenderName'] = d['GenderName'].replace(rpldct).fillna('Unknown')
m = du.get_map_from_excel(fp=fp, sheet_name='gender', s='Gender', t='GenderId')
d['GenderId'] = du.set_map(d['GenderName'], mapping=m)

#%% Map the home province to a two-character, zero-padded state code
m = du.get_map_from_excel(fp=fp, sheet_name='state', s='title', t='code')
d['StateId'] = du.set_map(d['ProvinceName_Home'].astype(str), mapping=m).str.zfill(2)
# NOTE(review): astype(str) turns missing provinces into the string 'nan' and
# set_map's default mode leaves unmapped titles untouched, so the isna()
# diagnostics below presumably never report anything -- confirm.
dd = d.loc[d['StateId'].isna(), 'ProvinceName_Home'].value_counts().to_dict()
print(d['StateId'].value_counts(dropna=False))
print(d.loc[d['StateId'].isna(), 'ProvinceName_Home'].value_counts())

#%% Export Data
d.to_csv('./static/data/inpcr-93-96-data-final-cleaned.csv')
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import khayyam as kym
from lifelines import KaplanMeierFitter
from lifelines.utils import datetimes_to_durations
from lifelines.datasets import load_dd
from lifelines.statistics import logrank_test
import itertools
import datautils as du
# Excel workbook holding the lookup sheets that are read further down with
# du.get_map_from_excel (gender, organ and state mappings).
fp = './static/mappings.xlsx'
#%% Functions
def convert_date(x):
    """Convert a compact Jalali date string to the result of ``JalaliDate.todate()``.

    Args:
        x: Date written as ``YYYYMMDD`` (year in the first four characters,
            month in the next two, day in the next two).

    Returns:
        The value returned by ``khayyam.JalaliDate(...).todate()``, or
        ``pd.NaT`` when ``x`` cannot be sliced or parsed as a valid date.
    """
    try:
        return kym.JalaliDate(x[:4], x[4:6], x[6:8]).todate()
    except Exception:
        # Deliberately best-effort: any malformed value becomes NaT. Catching
        # Exception (rather than a bare except) still lets KeyboardInterrupt
        # and SystemExit propagate while a long .apply() is running.
        return pd.NaT
#%% Get registry data
# Columns of interest in the cleaned registry export. Only the commented-out
# read_csv below refers to this list; note that 'GenderId' appears twice.
usecols = ['Year', 'NationalId', 'GenderId', 'ReportDate', 'Age', 'GenderId', 'Cfinal', 'Mfinal']
# d = pd.read_csv('./static/data/inpcr-93-96-data-final-cleaned.csv', usecols=usecols)
# Load the registry extract that already carries the mortality columns
# (DeadOrAlive, DeathDate) used below.
df = pd.read_csv('static/data/inpcr-93-96-data-with-mortality.csv')
# Ten-year, left-closed age bands: [0, 10), [10, 20), ..., [90, inf).
df['AgeCat'] = pd.cut(df['Age'], list(range(0, 100, 10))+[np.inf], right=False)
# Keep only records that were not registered from a death certificate and
# whose vital status is known; work on a copy so df itself stays untouched.
dff = df.loc[(df['DiagnosisMethodName'] != 'Death Certificate') & (df['DeadOrAlive'].notna())].copy()
# Parse both date columns with convert_date; unparseable values become NaT.
dff['ReportDate'] = dff['ReportDate'].astype(str).apply(convert_date)
dff['DeathDate'] = dff['DeathDate'].astype(str).apply(convert_date)
# Preliminary event flag; it is overwritten by datetimes_to_durations below.
dff['is_dead'] = dff['DeadOrAlive'] == 'Dead'
# dff['event_date'] = dff['DeathDate']
# Death date where known, otherwise a fixed end-of-follow-up date.
# NOTE(review): this uses 2020-07-17 while fill_date below is '2021-7-17' --
# confirm which censoring date is intended.
dff['event_date'] = np.where(dff['DeathDate'].notna(), dff['DeathDate'], datetime(2020,7,17))
# dff['time_to_event'] = (pd.to_datetime(dff['event_date']) - pd.to_datetime(dff['ReportDate'])).dt.days
# dff['time_to_event'] = dff['time_to_event'].fillna(1825)
# Follow-up duration (freq='M', i.e. presumably months) from report date to
# death date; rows without a death date are filled with fill_date. The second
# return value replaces the is_dead column set above.
dff['time_to_event'], dff['is_dead'] = datetimes_to_durations(start_times=dff['ReportDate'],
end_times=dff['DeathDate'],
fill_date='2021-7-17',
freq='M')
#%%
def get_survival_at_time(data, timeline):
    """Estimate Kaplan-Meier survival (in percent) at the given time points.

    Only rows with a strictly positive ``time_to_event`` are used. Groups with
    fewer than 100 such rows are skipped.

    Args:
        data: DataFrame with a ``time_to_event`` column (durations) and an
            ``is_dead`` column (event-observed flag).
        timeline: Time points at which the survival function is evaluated.

    Returns:
        A DataFrame indexed by the time points (index name ``'Measure'``) with
        the columns ``Value``, ``Lower`` and ``Upper`` (estimate and confidence
        bounds, multiplied by 100), or ``None`` when the group is too small or
        the fit fails.
    """
    ix = data['time_to_event'] > 0
    if len(data.loc[ix]) < 100:
        return None
    kmf = KaplanMeierFitter()
    T = data.loc[ix, 'time_to_event']
    E = data.loc[ix, 'is_dead']
    try:
        kmf.fit(T, event_observed=E, timeline=timeline)
    except Exception:
        # Best-effort: a failing group is reported and skipped. Exception
        # (instead of a bare except) keeps Ctrl-C / SystemExit working.
        print(f'error processing {data.columns} combination')
        return None
    # Point estimate and confidence bounds side by side, scaled to percent.
    outframe = pd.concat([kmf.survival_function_at_times(timeline), kmf.confidence_interval_], axis=1) * 100
    outframe.index.name = 'Measure'
    outframe.rename(columns={'KM_estimate': 'Value',
                             'KM_estimate_lower_0.95': 'Lower',
                             'KM_estimate_upper_0.95': 'Upper'}, inplace=True)
    return outframe
# Build the "survival cube": Kaplan-Meier survival for every non-empty
# combination of the dimensions in `dims`, written to ./SurvivalCube.csv.
c = 'organ'
organs = ['Oesophagus', 'Breast', 'Stomach', 'Colon', 'Rectum', 'Liver', 'Pancreas', 'Trachea, bronchus and lung']
dims = ['Year', 'GenderId', 'OrganId', 'StateId']
index = ['Year', 'GenderId', 'OrganId', 'StateId']

# Translate the descriptive columns into their codes via the mapping workbook.
m = du.get_map_from_excel(fp=fp, sheet_name='gender', s='Gender', t='GenderId')
dff['GenderId'] = dff['Gender'].pipe(du.set_map, mapping=m)
m = du.get_map_from_excel(fp=fp, sheet_name='organ', s='title', t='code')
dff['OrganId'] = dff['organ'].pipe(du.set_map, mapping=m)
m = du.get_map_from_excel(fp=fp, sheet_name='state', s='title', t='code')
dff['StateId'] = dff['StateName'].pipe(du.set_map, mapping=m)

# Keep only the dimensions plus the two survival inputs.
dff = dff[dims + ['is_dead', 'time_to_event']]

# Every subset of the dimensions of size 1..len(dims).
dim_combs = itertools.chain.from_iterable(
    itertools.combinations(dims, r) for r in range(1, len(dims) + 1))

# Month number -> measure label, e.g. 3 -> 'srv03m'. Built once: it does not
# depend on the combination being processed.
rndct = {month: f'srv{str(month).zfill(2)}m' for month in range(61)}

dfa = pd.DataFrame()
for comb in dim_combs:
    # NOTE(review): rename(index=rndct) is applied without a `level`, so it
    # presumably also touches the group-key levels of the index; that is only
    # harmless while no key equals an integer in 0..60 -- confirm.
    tmp = (
        dff[list(comb) + ['is_dead', 'time_to_event']]
        .groupby(list(comb))
        .apply(lambda x: get_survival_at_time(x, np.linspace(0, 60, 21)))
        .round(2)
        .rename(index=rndct)
        .reset_index()
    )
    dfa = pd.concat([dfa, tmp], axis=0)
# rndct = dict(zip(range(61), map(lambda x: f'srv{str(x).zfill(2)}m', range(61))))
#
# dfa = dfa.rename(columns=rndct)
# dfa = dfa.melt(id_vars=['Year', 'Gender', 'GenderId', 'OrganId', 'StateId'],
# value_vars=list(rndct.values()), var_name='Measure', value_name='Value')

# Dimensions that were not part of a combination are missing in its rows;
# label them 'All' so each row reads as an aggregate over that dimension.
dfa['Year'] = dfa.Year.astype(pd.Int64Dtype()).fillna(0).replace({0: 'All'})
dfa['GenderId'] = dfa.GenderId.fillna(999).replace({999: 'All'})
dfa.fillna('All', inplace=True)
dfa.to_csv('./SurvivalCube.csv')
# print(kmf.survival_function_at_times(range(1824, 1825, 1824)).rename({1824: cc}) * 100)
from datautils.mapping import *
import pandas as pd
def get_map_from_excel(fp, s, t, sheet_name=None):
    """Build a ``{source: target}`` dictionary from two columns of an Excel sheet.

    Args:
        fp: Path (or anything ``pd.read_excel`` accepts) of the workbook.
        s: Name of the column providing the dictionary keys.
        t: Name of the column providing the dictionary values.
        sheet_name: Sheet to read. ``None`` (the default) reads the first sheet.

    Returns:
        dict mapping each value of column ``s`` to the value of column ``t`` in
        the same row; both columns are read as strings. If a key occurs more
        than once, the last row wins.
    """
    # pd.read_excel(sheet_name=None) returns a dict of *all* sheets, which
    # cannot be indexed by column name; fall back to the first sheet instead.
    sheet = 0 if sheet_name is None else sheet_name
    d = pd.read_excel(fp, sheet_name=sheet, dtype={s: str, t: str})
    return dict(zip(d[s], d[t]))
def set_map(s, mapping, ignore_missing=True):
    """Apply a value mapping to a Series.

    Args:
        s: Series whose values are translated.
        mapping: Dictionary of ``{old value: new value}``.
        ignore_missing: When true (default), values without an entry in
            ``mapping`` are kept unchanged (``Series.replace``). When false,
            ``Series.map`` is used, so such values become missing.

    Returns:
        A new Series with the mapping applied.
    """
    if not ignore_missing:
        return s.map(mapping)
    return s.replace(to_replace=mapping)
from iarc_check.validators import *
import pandas as pd
def icdo_c_code_preprocessing(s):
    """Normalise raw C-codes before validation.

    Missing values become empty strings, all spaces are removed and the text
    is upper-cased.

    Args:
        s: Series of raw code strings.

    Returns:
        A new Series with the normalised codes.
    """
    filled = s.fillna('')
    compact = filled.str.replace(' ', '')
    return compact.str.upper()
def icdo_c_code_format_validator(s):
    """Flag which codes have the form ``C``, two digits, a dot, one digit.

    Args:
        s: Series of code strings.

    Returns:
        Boolean Series, true where the whole string matches the pattern.
    """
    c_code_pattern = r'C\d\d\.\d'
    return s.str.fullmatch(c_code_pattern)
def icdo_c_code_corrector(s):
    """Repair C-codes that are only missing the dot or the leading ``C``.

    Two shapes are corrected:

    * ``C`` followed by three digits (``C501``) -> dot inserted (``C50.1``)
    * two digits, dot, one digit (``50.1``) -> ``C`` prepended (``C50.1``)

    Anything else is returned unchanged.

    Args:
        s: Series of (already normalised) code strings without missing values.

    Returns:
        A new Series with the corrected codes; the input Series is not
        modified.
    """
    # Work on a copy so the caller's Series is never mutated in place.
    fixed = s.copy()

    # Get codes which are correction candidates and correct them
    missing_dot = fixed.str.fullmatch(r'C\d\d\d')
    fixed[missing_dot] = fixed[missing_dot].apply(lambda code: f'{code[:3]}.{code[3]}')

    missing_prefix = fixed.str.fullmatch(r'\d\d\.\d')
    fixed[missing_prefix] = fixed[missing_prefix].apply(lambda code: f'C{code}')
    return fixed
def icdo_c_code_validation_correction(s, verbose=False):
    """Normalise C-codes, then try to repair the ones with an invalid format.

    Args:
        s: Series of raw C-codes.
        verbose: When true, print the valid/invalid counts before and after
            the correction step.

    Returns:
        A new Series of normalised codes in which repairable codes have been
        corrected; codes that cannot be repaired are kept as normalised.
    """
    cleaned = icdo_c_code_preprocessing(s)
    isvalid = icdo_c_code_format_validator(cleaned)
    if verbose:
        print(f'C-Code validity status (pre-correction):\n {isvalid.value_counts()}')

    # Only the codes that failed validation are handed to the corrector.
    needs_fix = ~isvalid
    cleaned.loc[needs_fix] = icdo_c_code_corrector(cleaned[needs_fix])

    if verbose:
        print(f'C-Code validity status (post-correction):\n {icdo_c_code_format_validator(cleaned).value_counts()}')
    return cleaned
def icdo_m_code_preprocessing(s):
    """Normalise raw M-codes before validation.

    Args:
        s: Series of raw code strings.

    Returns:
        A new Series where missing values are empty strings, spaces are
        removed and letters are upper-case.
    """
    without_missing = s.fillna('')
    return without_missing.str.replace(' ', '').str.upper()
def icdo_m_code_format_validator(s):
    """Flag which codes have the form ``M``, four digits, a dot, one digit.

    Args:
        s: Series of code strings.

    Returns:
        Boolean Series, true where the whole string matches ``M####.#``.
    """
    # The dot is escaped so that only a literal '.' separator is accepted.
    # Unescaped, it matched any character (e.g. 'M8140/3' or 'M8140X3').
    return s.str.fullmatch(r'M\d{4}\.\d')
def icdo_m_code_corrector(s):
    """Repair M-codes with a wrong prefix, a missing prefix or a '/' separator.

    Three shapes are corrected to ``M####.#``:

    * ``C`` + four digits + any single character + digit (``C8140/3``)
    * four digits + any single character + digit (``8140/3``)
    * ``M`` + four digits + ``/`` + digit (``M8140/3``)

    Anything else is returned unchanged.

    Args:
        s: Series of (already normalised) code strings without missing values.

    Returns:
        A new Series with the corrected codes; the input Series is not
        modified.
    """
    # Work on a copy so the caller's Series is never mutated in place.
    fixed = s.copy()

    # Wrong leading letter: replace 'C' by 'M' and normalise the separator.
    wrong_prefix = fixed.str.fullmatch(r'C\d{4}.\d')
    fixed[wrong_prefix] = fixed[wrong_prefix].apply(lambda code: f'M{code[1:5]}.{code[6]}')

    # No leading letter at all: prepend 'M' and normalise the separator.
    no_prefix = fixed.str.fullmatch(r'\d{4}.\d')
    fixed[no_prefix] = fixed[no_prefix].apply(lambda code: f'M{code[:4]}.{code[5]}')

    # Slash separator: replace it by a dot. (This rule used to be applied
    # twice in a row; the second pass could never match anything.)
    slash_separator = fixed.str.fullmatch(r'M\d{4}/\d')
    fixed[slash_separator] = fixed[slash_separator].apply(lambda code: f'{code[:5]}.{code[6]}')
    return fixed
def icdo_m_code_validation_correction(s, verbose=False):
    """Normalise M-codes, then try to repair the ones with an invalid format.

    Args:
        s: Series of raw M-codes.
        verbose: When true, print the valid/invalid counts before and after
            the correction step.

    Returns:
        A new Series of normalised codes in which repairable codes have been
        corrected; codes that cannot be repaired are kept as normalised.
    """
    s = s.pipe(icdo_m_code_preprocessing)
    isvalid = s.pipe(icdo_m_code_format_validator)
    if verbose:
        # Label fixed: this report is about M-codes (it used to say "C-Code").
        print(f'M-Code validity status (pre-correction):\n {isvalid.value_counts()}')
    # Only the codes that failed validation are handed to the corrector.
    s.loc[~isvalid] = s[~isvalid].pipe(icdo_m_code_corrector)
    if verbose:
        print(f'M-Code validity status (post-correction):\n {s.pipe(icdo_m_code_format_validator).value_counts()}')
    return s
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment