ceciliamacias
commited on
Commit
•
b69971d
1
Parent(s):
4450bc4
Create README.md
Browse files
README.md
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## Identificación de retinopatías
|
2 |
+
|
3 |
+
El propósito del siguiente trabajo es identificar, a partir de notas médicas, a los pacientes que tienen complicaciones diabéticas, como lo son la neuropatía, la nefropatía y la retinopatía. Es el trabajo final del curso Clinical Natural Language Processing impartido en Coursera. Las notas médicas para el entrenamiento del modelo se encuentran en el siguiente link:
|
4 |
+
|
5 |
+
https://raw.githubusercontent.com/hhsieh2416/Identify_Diabetic_Complications/main/data/diabetes_notes.csv
|
6 |
+
|
7 |
+
Y los datos para su validación se encuentran en el siguiente link:
|
8 |
+
|
9 |
+
https://raw.githubusercontent.com/hhsieh2416/Identify_Diabetic_Complications/main/data/glodstandrad.csv
|
10 |
+
|
11 |
+
En primera instancia, se crea el siguiente código para ignorar los warnings:
|
12 |
+
|
13 |
+
```python
|
14 |
+
|
15 |
+
# Import the required packages first, so every name exists before it is used.
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

# Silence the pandas warning raised when a regex with capture groups is
# passed to Series.str.contains.
warnings.filterwarnings("ignore", 'This pattern has match groups')

# Read the data (once).
datos = "https://raw.githubusercontent.com/hhsieh2416/Identify_Diabetic_Complications/main/data/diabetes_notes.csv"
df = pd.read_csv(datos)

# Number of words in each note; computed once and reused by the plot and
# by the summary statistics below.
palabras_por_nota = df['TEXT'].str.split().apply(len)

# Graphical analysis of the data: one bar per note, identified by NOTE_ID.
fig, ax = plt.subplots()
ax.bar(df['NOTE_ID'], palabras_por_nota)

# Summary statistics of the word count per note.
conteo = palabras_por_nota.tolist()
print('Media de palabras: ' + str(np.mean(conteo)))
print('Mediana de palabras: ' + str(np.median(conteo)))
print('Minimo de palabras: ' + str(np.min(conteo)))
print('Maximo de palabras: ' + str(np.max(conteo)))
|
42 |
+
|
43 |
+
def reporte_paciente(id):
    """Return the word tokens of the note(s) whose NOTE_ID equals ``id``.

    The matching TEXT values of the module-level ``df`` are collected into a
    list, the list is converted to its string representation, and every run
    of word characters in that string is returned.
    """
    textos = df.loc[df.NOTE_ID == id, 'TEXT'].tolist()
    return re.findall(r"\w+", str(textos))
|
46 |
+
|
47 |
+
# print(reporte_paciente(1))
|
48 |
+
|
49 |
+
```
|
50 |
+
|
51 |
+
Ahora bien, se genera una función la cual recibe nuestro DataFrame con las notas médicas, la palabra a buscar y el tamaño de la ventana
|
52 |
+
|
53 |
+
## Función sin expresiones regulares
|
54 |
+
```python
|
55 |
+
|
56 |
+
def extract_text_window(df, word, window_size, column_name = "TEXT"):
    """Locate ``word`` in a text column and return each hit with its context.

    Args:
        df: DataFrame holding the notes. The value of its first column is
            reported as ``NOTE_ID``; ``column_name`` must hold the note text.
        word: Term to search for. It is interpolated into the regular
            expressions as-is, so regex metacharacters keep their meaning.
        window_size: Number of characters kept on EACH side of a match.
        column_name: Name of the column that holds the text.

    Returns:
        A DataFrame with the columns WORD, START_INDEX, WINDOW_START,
        WINDOW_END, CONTEXT, FULL_TEXT and NOTE_ID, holding one row per match
        whose context contains none of the negation phrases. The string
        "No matches for the pattern" is returned when ``word`` never occurs.
    """
    regex = re.compile(f'({word})')
    regex_negative = re.compile(
        f'(no history of {word}|No history of {word}|any comorbid complications|family history|father also has {word}|denies {word}|Negative for {word})'
    )

    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on
    # every call.
    rows = []
    for note_id, text in zip(df.iloc[:, 0], df[column_name]):
        if not isinstance(text, str):
            continue  # a missing note (NaN) cannot be searched

        # A note may mention the term several times; keep every mention.
        for match in regex.finditer(text):
            # Clamp the window to the boundaries of the note.
            window_start = max(match.start() - window_size, 0)
            window_end = min(match.end() + window_size, len(text))
            rows.append({
                "WORD": match.group(),
                "START_INDEX": match.start(),
                "WINDOW_START": window_start,
                "WINDOW_END": window_end,
                "CONTEXT": text[window_start:window_end],
                "FULL_TEXT": text,
                "NOTE_ID": note_id,
            })

    if not rows:
        return "No matches for the pattern"

    final_df = pd.DataFrame(rows)

    # Drop the mentions whose context contains a negation phrase. The filter
    # always runs, so the result is defined even when no note contains a
    # negation; re.search is used instead of str.contains so the capture
    # group in the pattern does not trigger a pandas warning.
    keep = [regex_negative.search(context) is None for context in final_df["CONTEXT"]]
    return final_df[keep]
|
94 |
+
|
95 |
+
|
96 |
+
|
97 |
+
# Search for "diabet" in the medical notes
df = pd.read_csv("https://raw.githubusercontent.com/hhsieh2416/Identify_Diabetic_Complications/main/data/diabetes_notes.csv")

word = "diabet"
window_size = 50  # characters kept on each side of a match

diabetes_notes_window = extract_text_window(df, word, window_size)

# Bare expression: displays the result when run as a notebook cell.
diabetes_notes_window
|
105 |
+
```
|
106 |
+
|
107 |
+
Se crea una segunda función, la cual recibe nuestro DataFrame con las notas médicas, la expresión regular para la palabra a buscar, la expresión regular para frases como "historial familiar", "no tiene historial de diabetes" o "no se ha identificado diabetes", entre otras, y el tamaño de la ventana alrededor de la palabra a buscar.
|
108 |
+
|
109 |
+
## Función con expresiones regulares
|
110 |
+
```python
|
111 |
+
|
112 |
+
def extract_text_window_pro(df, pattern,negatives, window_size, column_name = "TEXT"):
    """Locate a regex in a text column and return each hit with its context.

    Args:
        df: DataFrame holding the notes. The value of its first column is
            reported as ``NOTE_ID``; ``column_name`` must hold the note text.
        pattern: Regular expression (string or compiled) to search for.
        negatives: Regular expression (string or compiled); a match is
            discarded when its context window matches this expression.
        window_size: Number of characters kept on EACH side of a match.
        column_name: Name of the column that holds the text.

    Returns:
        A DataFrame with the columns WORD, START_INDEX, WINDOW_START,
        WINDOW_END, CONTEXT, FULL_TEXT and NOTE_ID, holding one row per match
        whose context does not match ``negatives``. The string
        "No matches for the pattern" is returned when ``pattern`` never
        matches.
    """
    # Compile once, outside the loops; re.compile accepts an already
    # compiled pattern and returns it unchanged.
    pattern_re = re.compile(pattern)
    negatives_re = re.compile(negatives)

    # Rows are collected in a plain list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on
    # every call.
    rows = []
    for note_id, text in zip(df.iloc[:, 0], df[column_name]):
        if not isinstance(text, str):
            continue  # a missing note (NaN) cannot be searched

        # A note may contain several matches; keep every one of them.
        for match in pattern_re.finditer(text):
            # Clamp the window to the boundaries of the note.
            window_start = max(match.start() - window_size, 0)
            window_end = min(match.end() + window_size, len(text))
            rows.append({
                "WORD": match.group(),
                "START_INDEX": match.start(),
                "WINDOW_START": window_start,
                "WINDOW_END": window_end,
                "CONTEXT": text[window_start:window_end],
                "FULL_TEXT": text,
                "NOTE_ID": note_id,
            })

    if not rows:
        return "No matches for the pattern"

    final_df = pd.DataFrame(rows)

    # Drop the matches whose context is negated. re.search is used instead of
    # str.contains so capture groups in ``negatives`` do not trigger a pandas
    # warning.
    keep = [negatives_re.search(context) is None for context in final_df["CONTEXT"]]
    return final_df[keep]
|
143 |
+
|
144 |
+
|
145 |
+
# Search for diabetes/diabetic in the medical notes
df = pd.read_csv("https://raw.githubusercontent.com/hhsieh2416/Identify_Diabetic_Complications/main/data/diabetes_notes.csv")
pattern = "diabetes|diabetic" #"(?<![a-zA-Z])diabet(es|ic)?(?![a-zA-Z])"
window_size = 50

# "diabet", "diabetes" or "diabetic" not followed by another letter.
# The character class is [a-zA-Z]; the range A-z would also cover the
# punctuation characters that sit between "Z" and "a" in ASCII.
_diabet = r"diabet(?:es|ic)?(?![a-zA-Z])"

# Phrases showing that a mention is not a diagnosis of the patient
# (negations and family history). Groups are non-capturing and no
# alternative depends on a stray leading/trailing space.
negatives = "|".join([
    rf"[Nn]o history of {_diabet}",
    r"den(?:ies|y)? any comorbid complications",
    r"family history",
    rf"[Nn]egative for {_diabet}",
    rf"(?:father|mother)(?: also)? {_diabet}",
    r"no weakness, numbness or tingling",
    r"patient's mother and father",
    r"father also has diabetes",
])

diabetes_notes_window = extract_text_window_pro(df,pattern,negatives,window_size)

# Bare expression: displays the result when run as a notebook cell.
diabetes_notes_window
|
156 |
+
|
157 |
+
```
|
158 |
+
Ahora bien, es momento de obtener, mediante la función con expresiones regulares, los DataFrame para neuropathy, nephropathy y retinopathy.
|
159 |
+
|
160 |
+
```python
|
161 |
+
def _label_complication(windows, complication_pattern, label):
    """Return the rows of ``windows`` whose CONTEXT matches the pattern, labelled."""
    mask = windows['CONTEXT'].str.contains(pat=complication_pattern, regex=True)
    # Work on a copy so that adding the column does not write into a slice
    # of ``windows`` (which raises SettingWithCopyWarning).
    labelled = windows[mask].copy()
    labelled['COMPLICATIONS'] = label
    return labelled


# Every context window is searched, so nothing is de-duplicated up front:
# a note may mention the complication only in its second or third window.
# Duplicated notes are removed per complication below. The patterns use
# non-capturing groups and the character class [a-zA-Z].

neuropathy = _label_complication(
    diabetes_notes_window,
    r"(?<![a-zA-Z])neuropath(?:y|ic)?(?![a-zA-Z])|diabetic nerve pain|tingling",
    "neuropathy",
)
diabetes_notes_neuropathy = neuropathy[['NOTE_ID','CONTEXT','COMPLICATIONS']].drop_duplicates(subset=['NOTE_ID'])

print(diabetes_notes_neuropathy)
print(diabetes_notes_neuropathy.count())

nephropathy = _label_complication(
    diabetes_notes_window,
    r"(?<![a-zA-Z])nephropathy(?![a-zA-Z])|renal (?:insufficiency|disease)",
    "nephropathy",
)
diabetes_notes_nephropathy = nephropathy[['NOTE_ID','CONTEXT','COMPLICATIONS']].drop_duplicates(subset=['NOTE_ID'])

print(diabetes_notes_nephropathy)
print(diabetes_notes_nephropathy.count())

retinopathy = _label_complication(
    diabetes_notes_window,
    r"(?<![a-zA-Z])retinopath(?:y|ic)?(?![a-zA-Z])",
    "retinopathy",
)
diabetes_notes_retinopathy = retinopathy[['NOTE_ID','CONTEXT','COMPLICATIONS']].drop_duplicates(subset=['NOTE_ID'])

print(diabetes_notes_retinopathy)
print(diabetes_notes_retinopathy.count())
|
189 |
+
|
190 |
+
```
|
191 |
+
Para validar que nuestras funciones estén obteniendo bien la información, se hace uso del segundo link, el cual nos fue proporcionado para la validación de estas notas médicas.
|
192 |
+
|
193 |
+
```python
|
194 |
+
# Load the validation (gold standard) data from the link mentioned above and
# build one DataFrame per pathology with the notes flagged as positive.
datos_verificacion = pd.read_csv("https://raw.githubusercontent.com/hhsieh2416/Identify_Diabetic_Complications/main/data/glodstandrad.csv")

datos_verificacion_neuropathy = datos_verificacion.loc[
    datos_verificacion['DIABETIC_NEUROPATHY'] == 1,
    ['NOTE_ID', 'DIABETIC_NEUROPATHY'],
]
print(datos_verificacion_neuropathy)
print(datos_verificacion_neuropathy.count())

datos_verificacion_nephropathy = datos_verificacion.loc[
    datos_verificacion['DIABETIC_NEPHROPATHY'] == 1,
    ['NOTE_ID', 'DIABETIC_NEPHROPATHY'],
]
print(datos_verificacion_nephropathy)
print(datos_verificacion_nephropathy.count())

datos_verificacion_retinopathy = datos_verificacion.loc[
    datos_verificacion['DIABETIC_RETINOPATHY'] == 1,
    ['NOTE_ID', 'DIABETIC_RETINOPATHY'],
]
print(datos_verificacion_retinopathy)
print(datos_verificacion_retinopathy.count())

# Outer-join the gold-standard positives with the notes we detected; the
# _merge indicator tells on which side(s) each NOTE_ID was found.
ver_neuro = datos_verificacion_neuropathy.merge(
    diabetes_notes_neuropathy, how='outer', on='NOTE_ID', indicator=True)
print(ver_neuro)

ver_nephro = datos_verificacion_nephropathy.merge(
    diabetes_notes_nephropathy, how='outer', on='NOTE_ID', indicator=True)
print(ver_nephro)

ver_retino = datos_verificacion_retinopathy.merge(
    diabetes_notes_retinopathy, how='outer', on='NOTE_ID', indicator=True)
print(ver_retino)

# Counts.
# Rows without a gold-standard flag were detected by us only (false positives).
conteo_na_neuro_falso_positivo = ver_neuro['DIABETIC_NEUROPATHY'].isna().sum()
conteo_na_nephro_falso_positivo = ver_nephro['DIABETIC_NEPHROPATHY'].isna().sum()
conteo_na_retino_falso_positivo = ver_retino['DIABETIC_RETINOPATHY'].isna().sum()

print(
    'Pacientes sin complicaciones pero que si se identifican: ',
    conteo_na_neuro_falso_positivo
    + conteo_na_nephro_falso_positivo
    + conteo_na_retino_falso_positivo,
)

# Rows without a COMPLICATIONS label are gold-standard positives that were
# not detected (false negatives).
conteo_na_neuro_falso_negativo = ver_neuro['COMPLICATIONS'].isna().sum()
conteo_na_nephro_falso_negativo = ver_nephro['COMPLICATIONS'].isna().sum()
conteo_na_retino_falso_negativo = ver_retino['COMPLICATIONS'].isna().sum()

print(
    'Pacientes con complicaciones que no fueron detectados: ',
    conteo_na_neuro_falso_negativo
    + conteo_na_nephro_falso_negativo
    + conteo_na_retino_falso_negativo,
)

# Rows present on both sides of the join were detected correctly.
conteo_correcto_neuro = len(ver_neuro.loc[ver_neuro['_merge'] == 'both'])
conteo_correcto_nephro = len(ver_nephro.loc[ver_nephro['_merge'] == 'both'])
conteo_correcto_retino = len(ver_retino.loc[ver_retino['_merge'] == 'both'])

print(
    'Pacientes que tienen complicaciones diabetes que si se encontaron: ',
    conteo_correcto_nephro + conteo_correcto_neuro + conteo_correcto_retino,
)

# Rows flagged with 1 in the gold standard, per complication.
conteo_complicacion_neuro = len(ver_neuro.loc[ver_neuro['DIABETIC_NEUROPATHY'] == 1])
conteo_complicacion_nephro = len(ver_nephro.loc[ver_nephro['DIABETIC_NEPHROPATHY'] == 1])
conteo_complicacion_retino = len(ver_retino.loc[ver_retino['DIABETIC_RETINOPATHY'] == 1])
print(
    'Pacientes que tienen complicaciones diabeticas: ',
    conteo_complicacion_neuro
    + conteo_complicacion_nephro
    + conteo_complicacion_retino,
)
|
248 |
+
|
249 |
+
|
250 |
+
|
251 |
+
# Translate the COMPLICATIONS label written by the extraction step into the
# 0/1 encoding used by the gold standard. Notes that were not flagged have no
# label after the merge (NaN) and are filled with 0 below.
d_neuro = {'neuropathy': 1}
d_nephro = {'nephropathy': 1}
d_retino = {'retinopathy': 1}

# Join the full gold standard with our detections, per complication, and
# compare the gold-standard column against our 0/1 prediction.
cor_neuro = datos_verificacion[['NOTE_ID', 'DIABETIC_NEUROPATHY']].merge(
    diabetes_notes_neuropathy[['NOTE_ID', 'COMPLICATIONS']],
    how='outer', on='NOTE_ID', indicator=True)
# astype(int) keeps the predictions in the same integer encoding as the gold labels.
cor_neuro['COMPLICATIONS'] = cor_neuro['COMPLICATIONS'].map(d_neuro).fillna(0).astype(int)

print('---NEUROPATHY---')
print(cor_neuro)

print(classification_report(cor_neuro['DIABETIC_NEUROPATHY'].tolist(), cor_neuro['COMPLICATIONS'].tolist()))

cor_nephro = datos_verificacion[['NOTE_ID', 'DIABETIC_NEPHROPATHY']].merge(
    diabetes_notes_nephropathy[['NOTE_ID', 'COMPLICATIONS']],
    how='outer', on='NOTE_ID', indicator=True)
cor_nephro['COMPLICATIONS'] = cor_nephro['COMPLICATIONS'].map(d_nephro).fillna(0).astype(int)
print('---NEPHROPATHY---')
print(cor_nephro)

print(classification_report(cor_nephro['DIABETIC_NEPHROPATHY'].tolist(), cor_nephro['COMPLICATIONS'].tolist()))

cor_retino = datos_verificacion[['NOTE_ID', 'DIABETIC_RETINOPATHY']].merge(
    diabetes_notes_retinopathy[['NOTE_ID', 'COMPLICATIONS']],
    how='outer', on='NOTE_ID', indicator=True)
cor_retino['COMPLICATIONS'] = cor_retino['COMPLICATIONS'].map(d_retino).fillna(0).astype(int)
print('---RETINOPATHY---')
print(cor_retino)

print(classification_report(cor_retino['DIABETIC_RETINOPATHY'].tolist(), cor_retino['COMPLICATIONS'].tolist()))
|
272 |
+
```
|