Clasificacion de Diagnostico de Cancer de Mama con Machine Learning¶
Este analisis utiliza el dataset Wisconsin Breast Cancer para construir modelos de clasificacion que permitan distinguir entre tumores benignos y malignos a partir de caracteristicas extraidas de imagenes digitalizadas de biopsias por aspiracion con aguja fina (FNA). Se exploran tecnicas de preprocesamiento, manejo de desbalance de clases y multiples algoritmos de clasificacion.
import seaborn as sns
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time
# Librerias de clasificadores
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import collections
# Otras Librerias
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import xgboost as xgb
from sklearn.model_selection import KFold
from IPython.display import HTML, display
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load the Wisconsin Breast Cancer dataset (one row per FNA biopsy image).
df = pd.read_csv('data/data.csv')
1. Analisis Exploratorio de Datos (EDA)¶
Comenzamos con un analisis exploratorio para comprender la estructura del dataset, identificar valores faltantes, detectar registros duplicados y obtener una vision general de las distribuciones de las variables.
# Encode the target label: benign (B) -> 0, malignant (M) -> 1.
label_encoding = {'B': 0, 'M': 1}
df['diagnosis'] = df['diagnosis'].map(label_encoding)
df['diagnosis'].value_counts()
diagnosis 0 357 1 212 Name: count, dtype: int64
# Target vector: the encoded diagnosis (0 = benign, 1 = malignant).
y = df['diagnosis']
# Feature matrix: everything except the target, the empty 'Unnamed: 32'
# column and the non-predictive 'id' column.
X = df.drop(['Unnamed: 32', 'id', 'diagnosis'], axis=1)
# Count missing values per column and keep only the columns that have any.
cols_with_missing = df.isnull().sum()
cols_with_missing = cols_with_missing[cols_with_missing>0]
cols_with_missing.sort_values(inplace=True)
# Horizontal bar chart of the missing-value counts per feature.
fig, ax = plt.subplots(figsize=(7,6))
width = 0.70  # the width of the bars
ind = np.arange(len(cols_with_missing))  # the y locations for the bars
ax.barh(ind, cols_with_missing, width, color="blue")
ax.set_yticks(ind+width/2)
ax.set_yticklabels(cols_with_missing.index, minor=False)
plt.xlabel('Count')
plt.ylabel('Features')
Text(0, 0.5, 'Features')
# 'Unnamed: 32' is completely empty and 'id' carries no predictive signal,
# so both are removed from the dataframe.
df = df.drop(['Unnamed: 32', 'id'], axis=1)

# Rebuild the target (y) and the feature matrix (X) from the cleaned frame.
y = df.diagnosis  # 0 = benign, 1 = malignant
X = df.drop(['diagnosis'], axis=1)
# Report the class shares to quantify the imbalance.
benign_pct = round(df['diagnosis'].value_counts()[0] / len(df) * 100, 2)
malign_pct = round(df['diagnosis'].value_counts()[1] / len(df) * 100, 2)
print('Los casos benignos son el', benign_pct, '% del dataset')
print('Los casos malignos son el', malign_pct, '% del dataset')
# Bar chart of benign vs malignant counts to check whether the classes
# are balanced.
ax = sns.countplot(y,label="Count")  # M = 212, B = 357
B, M = y.value_counts()
print('Número de benignos: ',B)
print('Número de malignos : ',M)
ax.set_ylabel('Número de pacientes')

# Annotate each bar with the share of the total it represents.
bars = ax.patches
half = len(bars) // 2
for left, right in zip(bars[:half], bars[half:]):
    height_l = left.get_height()
    height_r = right.get_height()
    total = height_l + height_r
    ax.text(left.get_x() + left.get_width() / 2., height_l + 40,
            '{0:.0%}'.format(height_l / total), ha="center")
    ax.text(right.get_x() + right.get_width() / 2., height_r + 40,
            '{0:.0%}'.format(height_r / total), ha="center")
Los casos benignos son el 62.74 % del dataset Los casos malignos son el 37.26 % del dataset
Número de benignos: 357 Número de malignos : 212
# Check whether any feature rows are duplicated; print them if so.
duplicated_mask = X.duplicated()
print(duplicated_mask.any())
print(X[duplicated_mask])
False Empty DataFrame Columns: [radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave points_worst, symmetry_worst, fractal_dimension_worst] Index: [] [0 rows x 30 columns]
# Styled descriptive-statistics table: a bar on the mean column plus colour
# gradients on the std and median, so high-variance features stand out.
X.describe().T.style.bar(subset=['mean'], color='#205ff2')\
.background_gradient(subset=['std'], cmap='Reds')\
.background_gradient(subset=['50%'], cmap='coolwarm')
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| radius_mean | 569.000000 | 14.127292 | 3.524049 | 6.981000 | 11.700000 | 13.370000 | 15.780000 | 28.110000 |
| texture_mean | 569.000000 | 19.289649 | 4.301036 | 9.710000 | 16.170000 | 18.840000 | 21.800000 | 39.280000 |
| perimeter_mean | 569.000000 | 91.969033 | 24.298981 | 43.790000 | 75.170000 | 86.240000 | 104.100000 | 188.500000 |
| area_mean | 569.000000 | 654.889104 | 351.914129 | 143.500000 | 420.300000 | 551.100000 | 782.700000 | 2501.000000 |
| smoothness_mean | 569.000000 | 0.096360 | 0.014064 | 0.052630 | 0.086370 | 0.095870 | 0.105300 | 0.163400 |
| compactness_mean | 569.000000 | 0.104341 | 0.052813 | 0.019380 | 0.064920 | 0.092630 | 0.130400 | 0.345400 |
| concavity_mean | 569.000000 | 0.088799 | 0.079720 | 0.000000 | 0.029560 | 0.061540 | 0.130700 | 0.426800 |
| concave points_mean | 569.000000 | 0.048919 | 0.038803 | 0.000000 | 0.020310 | 0.033500 | 0.074000 | 0.201200 |
| symmetry_mean | 569.000000 | 0.181162 | 0.027414 | 0.106000 | 0.161900 | 0.179200 | 0.195700 | 0.304000 |
| fractal_dimension_mean | 569.000000 | 0.062798 | 0.007060 | 0.049960 | 0.057700 | 0.061540 | 0.066120 | 0.097440 |
| radius_se | 569.000000 | 0.405172 | 0.277313 | 0.111500 | 0.232400 | 0.324200 | 0.478900 | 2.873000 |
| texture_se | 569.000000 | 1.216853 | 0.551648 | 0.360200 | 0.833900 | 1.108000 | 1.474000 | 4.885000 |
| perimeter_se | 569.000000 | 2.866059 | 2.021855 | 0.757000 | 1.606000 | 2.287000 | 3.357000 | 21.980000 |
| area_se | 569.000000 | 40.337079 | 45.491006 | 6.802000 | 17.850000 | 24.530000 | 45.190000 | 542.200000 |
| smoothness_se | 569.000000 | 0.007041 | 0.003003 | 0.001713 | 0.005169 | 0.006380 | 0.008146 | 0.031130 |
| compactness_se | 569.000000 | 0.025478 | 0.017908 | 0.002252 | 0.013080 | 0.020450 | 0.032450 | 0.135400 |
| concavity_se | 569.000000 | 0.031894 | 0.030186 | 0.000000 | 0.015090 | 0.025890 | 0.042050 | 0.396000 |
| concave points_se | 569.000000 | 0.011796 | 0.006170 | 0.000000 | 0.007638 | 0.010930 | 0.014710 | 0.052790 |
| symmetry_se | 569.000000 | 0.020542 | 0.008266 | 0.007882 | 0.015160 | 0.018730 | 0.023480 | 0.078950 |
| fractal_dimension_se | 569.000000 | 0.003795 | 0.002646 | 0.000895 | 0.002248 | 0.003187 | 0.004558 | 0.029840 |
| radius_worst | 569.000000 | 16.269190 | 4.833242 | 7.930000 | 13.010000 | 14.970000 | 18.790000 | 36.040000 |
| texture_worst | 569.000000 | 25.677223 | 6.146258 | 12.020000 | 21.080000 | 25.410000 | 29.720000 | 49.540000 |
| perimeter_worst | 569.000000 | 107.261213 | 33.602542 | 50.410000 | 84.110000 | 97.660000 | 125.400000 | 251.200000 |
| area_worst | 569.000000 | 880.583128 | 569.356993 | 185.200000 | 515.300000 | 686.500000 | 1084.000000 | 4254.000000 |
| smoothness_worst | 569.000000 | 0.132369 | 0.022832 | 0.071170 | 0.116600 | 0.131300 | 0.146000 | 0.222600 |
| compactness_worst | 569.000000 | 0.254265 | 0.157336 | 0.027290 | 0.147200 | 0.211900 | 0.339100 | 1.058000 |
| concavity_worst | 569.000000 | 0.272188 | 0.208624 | 0.000000 | 0.114500 | 0.226700 | 0.382900 | 1.252000 |
| concave points_worst | 569.000000 | 0.114606 | 0.065732 | 0.000000 | 0.064930 | 0.099930 | 0.161400 | 0.291000 |
| symmetry_worst | 569.000000 | 0.290076 | 0.061867 | 0.156500 | 0.250400 | 0.282200 | 0.317900 | 0.663800 |
| fractal_dimension_worst | 569.000000 | 0.083946 | 0.018061 | 0.055040 | 0.071460 | 0.080040 | 0.092080 | 0.207500 |
Mapa de Calor de Correlaciones¶
El mapa de calor nos permite visualizar la matriz de correlacion entre todas las variables continuas. Los colores representan la magnitud y direccion de la correlacion: valores cercanos a 1 (o -1) indican alta correlacion positiva (o negativa), mientras que valores cercanos a 0 sugieren poca relacion lineal.
# Heatmap of the pairwise correlations between all features.
correlation_matrix = X.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(correlation_matrix, annot=True, linewidths=.5, fmt='.1f', ax=ax);
Interpretacion de Correlaciones y Seleccion de Variables¶
El mapa de calor revela grupos de variables altamente correlacionadas entre si.
Por ejemplo, radius_mean, perimeter_mean y area_mean presentan alta
correlacion mutua, lo cual es esperable dado que las tres describen el tamano
del nucleo celular. De manera similar, compactness_mean, concavity_mean y
concave points_mean estan fuertemente correlacionadas.
Para reducir la multicolinealidad y mejorar la estabilidad de los modelos, seleccionamos un representante de cada grupo de variables correlacionadas, eliminando las redundantes.
# Drop one member of each highly correlated feature group to reduce
# multicollinearity; the remaining columns act as group representatives.
redundant_features = [
    'perimeter_mean', 'radius_mean', 'compactness_mean', 'concave points_mean',
    'radius_se', 'perimeter_se', 'radius_worst', 'perimeter_worst',
    'compactness_worst', 'concave points_worst', 'compactness_se',
    'concave points_se', 'texture_worst', 'area_worst',
]
x_1 = X.drop(redundant_features, axis=1)

# Correlation map of the reduced feature set.
f, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(x_1.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax)
<Axes: >
2. Preprocesamiento¶
Aplicamos un escalado MinMaxScaler para normalizar las variables al rango [0, 1]. Esto es particularmente importante para algoritmos sensibles a la escala de las variables, como SVM. Ademas, preparamos los conjuntos de entrenamiento y prueba.
# Min-max scale every column of the dataframe to the [0, 1] range.
# (Cell-local imports, notebook-style; some are unused leftovers.)
from pandas import read_csv
from pandas import DataFrame
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot
data = df.values[:, :]
# Fit a MinMaxScaler and transform the whole dataset. The 0/1 target column
# is scaled too, but min-max scaling leaves a {0, 1} column unchanged.
trans = MinMaxScaler()
data = trans.fit_transform(data)
# convert the array back to a dataframe, restoring the column names
dfbalanceado = DataFrame(data)
dfbalanceado.columns = df.columns
# summarize: all columns should now span [0, 1]
print(dfbalanceado.describe())
# histograms of the scaled variables
dfbalanceado.hist()
pyplot.show()
diagnosis radius_mean texture_mean perimeter_mean area_mean \
count 569.000000 569.000000 569.000000 569.000000 569.000000
mean 0.372583 0.338222 0.323965 0.332935 0.216920
std 0.483918 0.166787 0.145453 0.167915 0.149274
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.223342 0.218465 0.216847 0.117413
50% 0.000000 0.302381 0.308759 0.293345 0.172895
75% 1.000000 0.416442 0.408860 0.416765 0.271135
max 1.000000 1.000000 1.000000 1.000000 1.000000
smoothness_mean compactness_mean concavity_mean concave points_mean \
count 569.000000 569.000000 569.000000 569.000000
mean 0.394785 0.260601 0.208058 0.243137
std 0.126967 0.161992 0.186785 0.192857
min 0.000000 0.000000 0.000000 0.000000
25% 0.304595 0.139685 0.069260 0.100944
50% 0.390358 0.224679 0.144189 0.166501
75% 0.475490 0.340531 0.306232 0.367793
max 1.000000 1.000000 1.000000 1.000000
symmetry_mean ... radius_worst texture_worst perimeter_worst \
count 569.000000 ... 569.000000 569.000000 569.000000
mean 0.379605 ... 0.296663 0.363998 0.283138
std 0.138456 ... 0.171940 0.163813 0.167352
min 0.000000 ... 0.000000 0.000000 0.000000
25% 0.282323 ... 0.180719 0.241471 0.167837
50% 0.369697 ... 0.250445 0.356876 0.235320
75% 0.453030 ... 0.386339 0.471748 0.373475
max 1.000000 ... 1.000000 1.000000 1.000000
area_worst smoothness_worst compactness_worst concavity_worst \
count 569.000000 569.000000 569.000000 569.000000
mean 0.170906 0.404138 0.220212 0.217403
std 0.139932 0.150779 0.152649 0.166633
min 0.000000 0.000000 0.000000 0.000000
25% 0.081130 0.300007 0.116337 0.091454
50% 0.123206 0.397081 0.179110 0.181070
75% 0.220901 0.494156 0.302520 0.305831
max 1.000000 1.000000 1.000000 1.000000
concave points_worst symmetry_worst fractal_dimension_worst
count 569.000000 569.000000 569.000000
mean 0.393836 0.263307 0.189596
std 0.225884 0.121954 0.118466
min 0.000000 0.000000 0.000000
25% 0.223127 0.185098 0.107700
50% 0.343402 0.247782 0.163977
75% 0.554639 0.318155 0.242949
max 1.000000 1.000000 1.000000
[8 rows x 31 columns]
# Move the 'diagnosis' column to the last position (index 30) for convenience.
target_col = dfbalanceado['diagnosis']
dfbalanceado.drop(['diagnosis'], axis=1, inplace=True)
dfbalanceado.insert(30, 'diagnosis', target_col)
dfbalanceado.head()
| radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | diagnosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.521037 | 0.022658 | 0.545989 | 0.363733 | 0.593753 | 0.792037 | 0.703140 | 0.731113 | 0.686364 | 0.605518 | ... | 0.141525 | 0.668310 | 0.450698 | 0.601136 | 0.619292 | 0.568610 | 0.912027 | 0.598462 | 0.418864 | 1.0 |
| 1 | 0.643144 | 0.272574 | 0.615783 | 0.501591 | 0.289880 | 0.181768 | 0.203608 | 0.348757 | 0.379798 | 0.141323 | ... | 0.303571 | 0.539818 | 0.435214 | 0.347553 | 0.154563 | 0.192971 | 0.639175 | 0.233590 | 0.222878 | 1.0 |
| 2 | 0.601496 | 0.390260 | 0.595743 | 0.449417 | 0.514309 | 0.431017 | 0.462512 | 0.635686 | 0.509596 | 0.211247 | ... | 0.360075 | 0.508442 | 0.374508 | 0.483590 | 0.385375 | 0.359744 | 0.835052 | 0.403706 | 0.213433 | 1.0 |
| 3 | 0.210090 | 0.360839 | 0.233501 | 0.102906 | 0.811321 | 0.811361 | 0.565604 | 0.522863 | 0.776263 | 1.000000 | ... | 0.385928 | 0.241347 | 0.094008 | 0.915472 | 0.814012 | 0.548642 | 0.884880 | 1.000000 | 0.773711 | 1.0 |
| 4 | 0.629893 | 0.156578 | 0.630986 | 0.489290 | 0.430351 | 0.347893 | 0.463918 | 0.518390 | 0.378283 | 0.186816 | ... | 0.123934 | 0.506948 | 0.341575 | 0.437364 | 0.172415 | 0.319489 | 0.558419 | 0.157500 | 0.142595 | 1.0 |
5 rows × 31 columns
# Inspect the final column layout.
print(dfbalanceado.columns)
Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
'smoothness_mean', 'compactness_mean', 'concavity_mean',
'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
'fractal_dimension_se', 'radius_worst', 'texture_worst',
'perimeter_worst', 'area_worst', 'smoothness_worst',
'compactness_worst', 'concavity_worst', 'concave points_worst',
'symmetry_worst', 'fractal_dimension_worst', 'diagnosis'],
dtype='str')
# Sanity check: number of columns, then number of rows.
print(len(dfbalanceado.columns))
print(len(dfbalanceado))
31 569
3. Manejo de Desbalance de Clases¶
El dataset presenta un desbalance entre las clases benigno y maligno. Para abordar este problema, evaluamos dos estrategias de remuestreo: sub-muestreo aleatorio (under-sampling) y sobre-muestreo aleatorio (over-sampling).
# Class shares in the scaled dataframe (still imbalanced: ~63% vs ~37%).
print('Benigno', round(dfbalanceado['diagnosis'].value_counts()[0]/len(dfbalanceado) * 100,2), '% del dataset')
print('Maligno', round(dfbalanceado['diagnosis'].value_counts()[1]/len(dfbalanceado) * 100,2), '% del dataset')
X = dfbalanceado.drop('diagnosis', axis=1)
y = dfbalanceado['diagnosis']
# 80/20 train/test split with a fixed seed for reproducibility.
# NOTE(review): stratify=y would guarantee identical class shares in both
# splits; the distribution check below shows this random split happens to
# land close anyway — confirm before relying on it.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)
# Convert the splits to NumPy arrays.
Xtrain = Xtrain.values
Xtest = Xtest.values
ytrain = ytrain.values
ytest = ytest.values
# Check that the train and test label distributions are similar.
train_unique_label, train_counts_label = np.unique(ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(ytest, return_counts=True)
print('-' * 100)
print('Distribución de las etiquetas: \n')
print(train_counts_label/ len(ytrain))
print(test_counts_label/ len(ytest))
# Sanity checks on the split sizes.
print(len(Xtrain))
print(len(Xtest))
print(len(ytrain))
print(len(ytest))
print(len(dfbalanceado))
print(len(dfbalanceado.columns))
print(len(X))
print(len(y))
Benigno 62.74 % del dataset Maligno 37.26 % del dataset ---------------------------------------------------------------------------------------------------- Distribución de las etiquetas: [0.62857143 0.37142857] [0.62280702 0.37719298] 455 114 455 114 569 31 569 569
3.1 Random Under-Sampling¶
El sub-muestreo aleatorio reduce la clase mayoritaria al tamano de la clase minoritaria, eliminando observaciones de forma aleatoria. Esto equilibra las clases pero puede descartar informacion util.
from imblearn.under_sampling import RandomUnderSampler

# Random under-sampling: shrink the majority class (benign) down to the size
# of the minority class (malignant), on the TRAINING split only so the test
# set keeps its natural distribution.
rus = RandomUnderSampler(random_state=0)
# fit_resample both fits and resamples in one call, so the separate
# rus.fit(...) the original cell performed was redundant work.
X_train_undersampling, y_train_undersampling = rus.fit_resample(Xtrain, ytrain)

# Rebuild a labelled dataframe for inspection and plotting.
df_under_sampling = pd.DataFrame(X_train_undersampling, columns=dfbalanceado.columns[:-1])
df_under_sampling['diagnosis'] = y_train_undersampling
df_under_sampling
| radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | diagnosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.308060 | 0.425769 | 0.297975 | 0.177094 | 0.314977 | 0.176676 | 0.111317 | 0.168191 | 0.378283 | 0.152064 | ... | 0.527719 | 0.241994 | 0.126229 | 0.297365 | 0.139525 | 0.182268 | 0.440550 | 0.257441 | 0.092680 | 0.0 |
| 1 | 0.135643 | 0.201894 | 0.132748 | 0.063499 | 0.381782 | 0.198791 | 0.054592 | 0.120080 | 0.165152 | 0.399115 | ... | 0.292377 | 0.119080 | 0.047016 | 0.467080 | 0.191140 | 0.067364 | 0.224330 | 0.184703 | 0.243015 | 0.0 |
| 2 | 0.239907 | 0.166385 | 0.236680 | 0.129714 | 0.455629 | 0.219434 | 0.154452 | 0.136630 | 0.310606 | 0.220514 | ... | 0.231343 | 0.196574 | 0.097670 | 0.516608 | 0.182699 | 0.243610 | 0.225017 | 0.232998 | 0.183458 | 0.0 |
| 3 | 0.247953 | 0.349341 | 0.246562 | 0.131326 | 0.514309 | 0.293908 | 0.191542 | 0.107654 | 0.537374 | 0.399747 | ... | 0.323827 | 0.172917 | 0.081130 | 0.455854 | 0.198126 | 0.282348 | 0.277938 | 0.225508 | 0.218746 | 0.0 |
| 4 | 0.204411 | 0.286777 | 0.208279 | 0.104305 | 0.390810 | 0.346973 | 0.362699 | 0.141849 | 0.502020 | 0.562974 | ... | 0.424840 | 0.183027 | 0.070709 | 0.419534 | 0.443878 | 0.593930 | 0.418557 | 0.343584 | 0.489702 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 333 | 0.569786 | 0.503213 | 0.540460 | 0.395546 | 0.339984 | 0.310472 | 0.343955 | 0.411083 | 0.451010 | 0.133319 | ... | 0.490139 | 0.510434 | 0.353372 | 0.354817 | 0.284571 | 0.459665 | 0.672165 | 0.471319 | 0.248196 | 1.0 |
| 334 | 0.428274 | 0.196145 | 0.428512 | 0.275589 | 0.381692 | 0.361082 | 0.282099 | 0.349950 | 0.364646 | 0.206403 | ... | 0.265458 | 0.367996 | 0.217460 | 0.477646 | 0.407981 | 0.395847 | 0.680756 | 0.286615 | 0.237439 | 1.0 |
| 335 | 0.341190 | 0.476835 | 0.339161 | 0.198176 | 0.379164 | 0.341145 | 0.261246 | 0.321173 | 0.593434 | 0.302654 | ... | 0.608475 | 0.321679 | 0.153878 | 0.559532 | 0.367329 | 0.299042 | 0.608935 | 0.622708 | 0.311951 | 1.0 |
| 336 | 0.552747 | 0.250592 | 0.536314 | 0.395970 | 0.476393 | 0.277958 | 0.341378 | 0.430666 | 0.457576 | 0.256318 | ... | 0.343284 | 0.473081 | 0.335185 | 0.522552 | 0.195797 | 0.261342 | 0.575258 | 0.261975 | 0.193625 | 1.0 |
| 337 | 0.331251 | 0.335137 | 0.327068 | 0.193425 | 0.481809 | 0.288080 | 0.263824 | 0.321223 | 0.307576 | 0.326032 | ... | 0.500533 | 0.316201 | 0.168133 | 0.595192 | 0.319692 | 0.325000 | 0.627835 | 0.318155 | 0.330972 | 1.0 |
338 rows × 31 columns
3.2 Random Over-Sampling¶
El sobre-muestreo aleatorio duplica observaciones de la clase minoritaria hasta igualar la clase mayoritaria. Esto preserva toda la informacion disponible, aunque puede introducir sobreajuste.
from imblearn.over_sampling import RandomOverSampler

# Random over-sampling: duplicate minority-class (malignant) rows until both
# classes are the same size, again on the TRAINING split only.
ros = RandomOverSampler(random_state=0)
# fit_resample both fits and resamples in one call, so the separate
# ros.fit(...) the original cell performed was redundant work.
X_train_oversampling, y_train_oversampling = ros.fit_resample(Xtrain, ytrain)

# Rebuild a labelled dataframe for inspection and plotting.
df_over_sampling = pd.DataFrame(X_train_oversampling, columns=dfbalanceado.columns[:-1])
df_over_sampling['diagnosis'] = y_train_oversampling
df_over_sampling
| radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | fractal_dimension_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | diagnosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.096928 | 0.257694 | 0.103656 | 0.045387 | 0.487226 | 0.373965 | 0.733365 | 0.217445 | 0.530808 | 0.642376 | ... | 0.283316 | 0.075153 | 0.034285 | 0.508684 | 0.397018 | 1.000000 | 0.601375 | 0.524936 | 0.409681 | 0.0 |
| 1 | 0.667755 | 0.570172 | 0.683505 | 0.495228 | 0.554934 | 0.809214 | 0.582709 | 0.743539 | 0.674242 | 0.505897 | ... | 0.571962 | 0.627970 | 0.467902 | 0.514627 | 0.709327 | 0.541534 | 0.997595 | 0.499310 | 0.481175 | 1.0 |
| 2 | 0.103744 | 0.140345 | 0.106489 | 0.049799 | 0.221901 | 0.208975 | 0.140300 | 0.108350 | 0.646970 | 0.414280 | ... | 0.192164 | 0.075601 | 0.030697 | 0.179555 | 0.136324 | 0.111581 | 0.174811 | 0.338459 | 0.195855 | 0.0 |
| 3 | 0.173648 | 0.524518 | 0.167369 | 0.086320 | 0.396678 | 0.162444 | 0.055740 | 0.080268 | 0.422727 | 0.280750 | ... | 0.617537 | 0.137308 | 0.066482 | 0.519910 | 0.109158 | 0.089856 | 0.210859 | 0.363493 | 0.173357 | 0.0 |
| 4 | 0.150930 | 0.174839 | 0.143459 | 0.071432 | 0.548614 | 0.187811 | 0.025398 | 0.064115 | 0.850000 | 0.413648 | ... | 0.144723 | 0.096867 | 0.045075 | 0.371987 | 0.069244 | 0.017316 | 0.088625 | 0.392667 | 0.165027 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 567 | 0.550381 | 0.356442 | 0.541151 | 0.403181 | 0.377088 | 0.267530 | 0.349110 | 0.384245 | 0.321717 | 0.148062 | ... | 0.406183 | 0.445690 | 0.299302 | 0.413590 | 0.178916 | 0.275240 | 0.512027 | 0.152967 | 0.125738 | 1.0 |
| 568 | 0.593923 | 0.769699 | 0.581922 | 0.457900 | 0.285005 | 0.287160 | 0.268276 | 0.329871 | 0.185859 | 0.066765 | ... | 0.889925 | 0.646397 | 0.563262 | 0.459816 | 0.371016 | 0.319089 | 0.558419 | 0.226296 | 0.135380 | 1.0 |
| 569 | 0.404610 | 0.806561 | 0.414000 | 0.255101 | 0.484517 | 0.443286 | 0.410262 | 0.417445 | 0.520707 | 0.348357 | ... | 1.000000 | 0.377957 | 0.208858 | 0.773493 | 0.513345 | 0.455511 | 0.692096 | 0.383797 | 0.428703 | 1.0 |
| 570 | 0.638885 | 0.397362 | 0.613019 | 0.493107 | 0.279137 | 0.196614 | 0.211856 | 0.299304 | 0.205556 | 0.038121 | ... | 0.377132 | 0.554261 | 0.384585 | 0.340950 | 0.197737 | 0.252236 | 0.496564 | 0.132663 | 0.106454 | 1.0 |
| 571 | 0.397037 | 0.441326 | 0.389814 | 0.248017 | 0.355421 | 0.258328 | 0.262887 | 0.371918 | 0.331818 | 0.231887 | ... | 0.368337 | 0.284327 | 0.158695 | 0.360100 | 0.167273 | 0.227316 | 0.507216 | 0.195348 | 0.086842 | 1.0 |
572 rows × 31 columns
# Bar chart confirming both classes are equal-sized after under-sampling.
colors = ["#0101DF", "#DF0101"]  # blue = benign (0), red = malignant (1); reused in later cells
sns.countplot(x='diagnosis', data=df_under_sampling, palette=colors)
plt.title('Clases igualmente distribuidas', fontsize=14)
plt.show()
3.3 Matrices de Correlacion Post-Remuestreo¶
Comparamos las matrices de correlacion del dataset completo, el sub-muestreado y el sobre-muestreado para verificar que las relaciones entre variables se mantienen consistentes tras el remuestreo.
# Compare correlation matrices of the full (imbalanced) data against the two
# resampled training sets, to verify that resampling did not distort the
# relationships between features.
f, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(24,20))
# Full dataframe (imbalanced) — shown only for contrast.
corr = dfbalanceado.corr()
sns.heatmap(corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax1)
ax1.set_title("Matriz de correlación con imbalance \n (no se usa como referencia)", fontsize=14)
# Under-sampled training set.
sub_sample_corr = df_under_sampling.corr()
sns.heatmap(sub_sample_corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax2)
ax2.set_title('Matriz de correlación usando submuestra \n (usar como referencia)', fontsize=14)
# Over-sampled training set.
over_sample_corr = df_over_sampling.corr()
sns.heatmap(over_sample_corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax3)
ax3.set_title('Matriz de correlación usando sobremuestra \n (usar como referencia)', fontsize=14)
plt.show()
4. Modelamiento y Evaluacion (Under-Sampling)¶
Entrenamos tres clasificadores sobre el conjunto sub-muestreado: Support Vector Classifier (SVC), Decision Tree y Naive Bayes. Para cada uno, realizamos validacion cruzada y optimizacion de hiperparametros mediante GridSearchCV.
# Rebuild X and y from the under-sampled dataframe.
X_train_undersampling = df_under_sampling.drop('diagnosis', axis=1)
y_train_undersampling = df_under_sampling['diagnosis']

# Candidate classifiers to compare on the balanced training data.
classifiers = {
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "Naive_bayes": GaussianNB()
}

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated score for each classifier. cross_val_score clones
# and fits each model internally, so the extra classifier.fit(...) call the
# original cell made before scoring was redundant and has been removed.
# Note: the default scorer here is accuracy, despite the word "precisión"
# in the printed (Spanish, colloquial) message.
for key, classifier in classifiers.items():
    training_score = cross_val_score(classifier, X_train_undersampling, y_train_undersampling, cv=5)
    # Scale to a percentage BEFORE rounding; round(mean, 2) * 100 truncated
    # the result to whole percentage points (e.g. 97.0 instead of 97.63).
    print("Clasificador: ", classifier.__class__.__name__,
          "tiene una precisión del", round(training_score.mean() * 100, 2), "%")
Clasificador: SVC tiene una precisión del 97.0 % Clasificador: DecisionTreeClassifier tiene una precisión del 90.0 % Clasificador: GaussianNB tiene una precisión del 92.0 %
# GridSearchCV to find the best hyperparameters (default 5-fold CV).
from sklearn.model_selection import GridSearchCV
# Support Vector Classifier: search regularisation strength C and kernel type.
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train_undersampling, y_train_undersampling)
# Best SVC found by the grid search (refit on the full training data).
svc = grid_svc.best_estimator_
# Decision tree: search split criterion, depth in {2, 3}, leaf size in {5, 6}.
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)),
"min_samples_leaf": list(range(5,7,1))}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train_undersampling, y_train_undersampling)
# Best decision tree found by the grid search.
tree_clf = grid_tree.best_estimator_
# Gaussian Naive Bayes has no hyperparameters to tune; fit it directly.
naive_clf = GaussianNB()
naive_clf.fit(X_train_undersampling, y_train_undersampling)
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
svc
SVC(C=0.7, kernel='linear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
tree_clf
DecisionTreeClassifier(max_depth=2, min_samples_leaf=6)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
# 5-fold cross-validated accuracy of each tuned model on the balanced set.
# The original cell formatted the score via round(...).astype(str), which
# only works because NumPy scalars happen to expose .astype; an f-string
# with a fixed two-decimal format is robust and equivalent.
svc_score = cross_val_score(svc, X_train_undersampling, y_train_undersampling, cv=5)
print('Support Vector Classifier Cross Validation Score', f'{svc_score.mean() * 100:.2f}%')
tree_score = cross_val_score(tree_clf, X_train_undersampling, y_train_undersampling, cv=5)
print('DecisionTree Classifier Cross Validation Score', f'{tree_score.mean() * 100:.2f}%')
naive_score = cross_val_score(naive_clf, X_train_undersampling, y_train_undersampling, cv=5)
print('Naive Bayes Classifier Cross Validation Score', f'{naive_score.mean() * 100:.2f}%')
Support Vector Classifier Cross Validation Score 97.63% DecisionTree Classifier Cross Validation Score 92.58% Naive Bayes Classifier Cross Validation Score 91.99%
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict

# Out-of-fold scores for ROC-AUC. The SVC exposes a continuous decision
# function; for the tree and Naive Bayes the original cell used hard 0/1
# label predictions, which makes their AUC values not comparable to the
# SVC's. Using the positive-class probability (predict_proba[:, 1]) puts
# all three models on the same continuous-score footing.
svc_pred = cross_val_predict(svc, X_train_undersampling, y_train_undersampling, cv=5,
                             method="decision_function")
tree_pred = cross_val_predict(tree_clf, X_train_undersampling, y_train_undersampling, cv=5,
                              method="predict_proba")[:, 1]
naive_pred = cross_val_predict(naive_clf, X_train_undersampling, y_train_undersampling, cv=5,
                               method="predict_proba")[:, 1]

from sklearn.metrics import roc_auc_score
print('Support Vector Classifier: ', roc_auc_score(y_train_undersampling, svc_pred))
print('Decision Tree Classifier: ', roc_auc_score(y_train_undersampling, tree_pred))
print('Naive Bayes Tree Classifier: ', roc_auc_score(y_train_undersampling, naive_pred))
Support Vector Classifier: 0.992297188473793 Decision Tree Classifier: 0.9289940828402368 Naive Bayes Tree Classifier: 0.9201183431952662
4.1 Matriz de Confusion y Metricas (Under-Sampling)¶
Evaluamos el desempeno de los modelos entrenados con under-sampling sobre el conjunto de prueba. La matriz de confusion nos permite visualizar los verdaderos positivos, falsos positivos, verdaderos negativos y falsos negativos de cada clasificador.
# Rebuild a labelled dataframe for the held-out TEST split. No resampling is
# applied to it, so the test classes keep their original imbalance (71 benign
# vs 43 malignant). The original cell's header said "OVERSAMPLING" and its
# title claimed the classes were equally distributed — both were incorrect
# for this split, so the displayed text is fixed here.
df_test = pd.DataFrame(Xtest, columns=dfbalanceado.columns[:-1])
df_test['diagnosis'] = ytest
print('Distribución de las clases en el conjunto de test (sin remuestreo)')
print(df_test['diagnosis'].value_counts())
sns.countplot(x='diagnosis', data=df_test, palette=colors)
plt.title('Distribución de clases en el conjunto de test', fontsize=14)
plt.show()
Distribución de las clases en el conjunto de datos de submuestra - OVERSAMPLING diagnosis 0.0 71 1.0 43 Name: count, dtype: int64
from sklearn.metrics import confusion_matrix
# Test-set predictions of each tuned model.
y_pred_svc = svc.predict(Xtest)
y_pred_tree = tree_clf.predict(Xtest)
y_pred_naive = naive_clf.predict(Xtest)
svc_cf = confusion_matrix(ytest, y_pred_svc)
tree_cf = confusion_matrix(ytest, y_pred_tree)
naive_cf = confusion_matrix(ytest, y_pred_naive)
fig, ax = plt.subplots(2, 2, figsize=(22, 12))
# Only three matrices to display in a 2x2 grid: hide the unused top-left axis.
ax[0][0].axis('off')
# Fixed typo in the SVM title ("Suppor" -> "Support").
for cf, axis, title in [
    (naive_cf, ax[0][1], "Naive Bayes \n Matriz de Confusión"),
    (svc_cf, ax[1][0], "Support Vector Machine \n Matriz de Confusión"),
    (tree_cf, ax[1][1], "DecisionTree \n Matriz de Confusión"),
]:
    sns.heatmap(cf, ax=axis, annot=True, cmap=plt.cm.copper)
    axis.set_title(title, fontsize=14)
    axis.set_xticklabels(['', ''], fontsize=14, rotation=90)
    axis.set_yticklabels(['', ''], fontsize=14, rotation=360)
plt.show()
# Per-class precision/recall/F1 on the test set.
# Class encoding (per the value_counts and report supports): 0 -> benign, 1 -> malignant.
# Removed unused y_true/y_pred sample lists left over from a documentation example.
target_names = ['Benigno', 'Maligno']
print("Reporte clasificación SVC")
print(classification_report(ytest, y_pred_svc, target_names=target_names))
print("")
print("Reporte clasificación Árbol")
print(classification_report(ytest, y_pred_tree, target_names=target_names))
print("")
print("Reporte clasificación Naive Bayes")
print(classification_report(ytest, y_pred_naive, target_names=target_names))
Reporte clasificación SVC
precision recall f1-score support
Benigno 0.97 0.99 0.98 71
Maligno 0.98 0.95 0.96 43
accuracy 0.97 114
macro avg 0.97 0.97 0.97 114
weighted avg 0.97 0.97 0.97 114
Reporte clasificación Árbol
precision recall f1-score support
Benigno 0.97 0.94 0.96 71
Maligno 0.91 0.95 0.93 43
accuracy 0.95 114
macro avg 0.94 0.95 0.94 114
weighted avg 0.95 0.95 0.95 114
Reporte clasificación Naive Bayes
precision recall f1-score support
Benigno 0.96 0.99 0.97 71
Maligno 0.98 0.93 0.95 43
accuracy 0.96 114
macro avg 0.97 0.96 0.96 114
weighted avg 0.97 0.96 0.96 114
5. Modelamiento y Evaluación (Over-Sampling)¶
Repetimos el proceso de entrenamiento y evaluación utilizando el conjunto sobre-muestreado. Esto nos permite comparar directamente el efecto de cada estrategia de balanceo sobre el rendimiento de los modelos.
# Shuffle the oversampled dataset, then split it into features (X) and target (y).
df_over_sampling = df_over_sampling.sample(frac=1)
# Cap at 50k rows — presumably a safeguard; well above this dataset's size,
# so in practice it keeps every row. TODO confirm the cap is intentional.
df_over_sampling_2 = df_over_sampling.head(50000)
X_train_oversampling = df_over_sampling_2.drop(columns='diagnosis')
y_train_oversampling = df_over_sampling_2['diagnosis']
# Candidate models, evaluated first with their default hyper-parameters.
classifiers = {}
classifiers["Support Vector Classifier"] = SVC()
classifiers["DecisionTreeClassifier"] = DecisionTreeClassifier()
classifiers["Naive_bayes"] = GaussianNB()
for key, classifier in classifiers.items():
    # Fit on the full oversampled training set so the fitted estimators remain
    # available; cross_val_score clones the estimator internally for the CV folds.
    classifier.fit(X_train_oversampling, y_train_oversampling)
    training_score = cross_val_score(classifier, X_train_oversampling, y_train_oversampling, cv=5)
    # Fixed rounding order: round(mean, 2) * 100 truncated every score to a
    # whole percent (e.g. 97.63% was reported as 98.0%). Scale first, then round.
    print("Clasificador: ", classifier.__class__.__name__, "tiene una precisión del", round(training_score.mean() * 100, 2), "%")
Clasificador: SVC tiene una precisión del 98.0 % Clasificador: DecisionTreeClassifier tiene una precisión del 94.0 % Clasificador: GaussianNB tiene una precisión del 93.0 %
# Hyper-parameter tuning with GridSearchCV (default 5-fold CV) for each model.
# --- Support Vector Classifier ---
svc_params = {
    'C': [0.5, 0.7, 0.9, 1],
    'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train_oversampling, y_train_oversampling)
svc = grid_svc.best_estimator_  # best SVC found by the search
# --- Decision Tree ---
tree_params = {
    "criterion": ["gini", "entropy"],
    "max_depth": [2, 3],
    "min_samples_leaf": [5, 6],
}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train_oversampling, y_train_oversampling)
tree_clf = grid_tree.best_estimator_  # best decision tree found by the search
# --- Gaussian Naive Bayes (no hyper-parameters tuned here) ---
naive_clf = GaussianNB()
naive_clf.fit(X_train_oversampling, y_train_oversampling)
GaussianNB()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
svc
SVC(C=0.7, kernel='linear')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
tree_clf
DecisionTreeClassifier(max_depth=3, min_samples_leaf=6)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
# 5-fold CV accuracy of each tuned model on the oversampled training set.
svc_score = cross_val_score(svc, X_train_oversampling, y_train_oversampling, cv=5)
print(f"Support Vector Classifier Cross Validation Score {round(svc_score.mean() * 100, 2)}%")
tree_score = cross_val_score(tree_clf, X_train_oversampling, y_train_oversampling, cv=5)
print(f"DecisionTree Classifier Cross Validation Score {round(tree_score.mean() * 100, 2)}%")
naive_score = cross_val_score(naive_clf, X_train_oversampling, y_train_oversampling, cv=5)
print(f"Naive Bayes Classifier Cross Validation Score {round(naive_score.mean() * 100, 2)}%")
Support Vector Classifier Cross Validation Score 97.9% DecisionTree Classifier Cross Validation Score 92.84% Naive Bayes Classifier Cross Validation Score 92.83%
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions: each sample is scored by a model that never saw it,
# so the AUC below is not optimistically biased.
svc_pred = cross_val_predict(svc, X_train_oversampling, y_train_oversampling, cv=5,
                             method="decision_function")
# ROC AUC needs a continuous score; the default method="predict" returns hard
# 0/1 labels, which underestimates AUC. Use the positive-class probability.
tree_pred = cross_val_predict(tree_clf, X_train_oversampling, y_train_oversampling, cv=5,
                              method="predict_proba")[:, 1]
naive_pred = cross_val_predict(naive_clf, X_train_oversampling, y_train_oversampling, cv=5,
                               method="predict_proba")[:, 1]
from sklearn.metrics import roc_auc_score
print('Support Vector Classifier: ', roc_auc_score(y_train_oversampling, svc_pred))
print('Decision Tree Classifier: ', roc_auc_score(y_train_oversampling, tree_pred))
# Fixed label: previously printed as "Naive Bayes Tree Classifier".
print('Naive Bayes Classifier: ', roc_auc_score(y_train_oversampling, naive_pred))
Support Vector Classifier: 0.9936427209154481 Decision Tree Classifier: 0.9318181818181819 Naive Bayes Tree Classifier: 0.9283216783216784
6. Comparación Final¶
Evaluamos los modelos entrenados con over-sampling sobre el conjunto de prueba y comparamos las matrices de confusión y métricas de clasificación con los resultados obtenidos mediante under-sampling. Las curvas ROC nos permiten visualizar el trade-off entre sensibilidad y especificidad para cada modelo.
# Rebuild the held-out test split as a DataFrame to inspect its class balance.
df_test = pd.DataFrame(Xtest, columns=dfbalanceado.columns[:-1])
df_test['diagnosis'] = ytest
# Fixed message: this is the TEST set, not the resampled training subsample,
# and its classes are NOT balanced (71 vs 43 per the value_counts output).
print('Distribución de las clases en el conjunto de prueba')
print(df_test['diagnosis'].value_counts())
sns.countplot(x='diagnosis', data=df_test, palette=colors)
plt.title('Distribución de clases en el conjunto de prueba', fontsize=14)
plt.show()
Distribución de las clases en el conjunto de datos de submuestra - OVERSAMPLING diagnosis 0.0 71 1.0 43 Name: count, dtype: int64
from sklearn.metrics import confusion_matrix
# Test-set predictions of each tuned model (oversampling variants).
y_pred_svc = svc.predict(Xtest)
y_pred_tree = tree_clf.predict(Xtest)
y_pred_naive = naive_clf.predict(Xtest)
svc_cf = confusion_matrix(ytest, y_pred_svc)
tree_cf = confusion_matrix(ytest, y_pred_tree)
naive_cf = confusion_matrix(ytest, y_pred_naive)
fig, ax = plt.subplots(2, 2, figsize=(22, 12))
# Only three matrices to display in a 2x2 grid: hide the unused top-left axis.
ax[0][0].axis('off')
# Fixed typo in the SVM title ("Suppor" -> "Support").
for cf, axis, title in [
    (naive_cf, ax[0][1], "Naive Bayes \n Matriz de Confusión"),
    (svc_cf, ax[1][0], "Support Vector Machine \n Matriz de Confusión"),
    (tree_cf, ax[1][1], "DecisionTree \n Matriz de Confusión"),
]:
    sns.heatmap(cf, ax=axis, annot=True, cmap=plt.cm.copper)
    axis.set_title(title, fontsize=14)
    axis.set_xticklabels(['', ''], fontsize=14, rotation=90)
    axis.set_yticklabels(['', ''], fontsize=14, rotation=360)
plt.show()
# Per-class precision/recall/F1 on the test set.
# Class encoding (per the value_counts and report supports): 0 -> benign, 1 -> malignant.
# Removed unused y_true/y_pred sample lists left over from a documentation example.
target_names = ['Benigno', 'Maligno']
print("Reporte clasificación SVC")
print(classification_report(ytest, y_pred_svc, target_names=target_names))
print("")
print("Reporte clasificación Árbol")
print(classification_report(ytest, y_pred_tree, target_names=target_names))
print("")
print("Reporte clasificación Naive Bayes")
print(classification_report(ytest, y_pred_naive, target_names=target_names))
Reporte clasificación SVC
precision recall f1-score support
Benigno 0.97 0.99 0.98 71
Maligno 0.98 0.95 0.96 43
accuracy 0.97 114
macro avg 0.97 0.97 0.97 114
weighted avg 0.97 0.97 0.97 114
Reporte clasificación Árbol
precision recall f1-score support
Benigno 0.97 0.97 0.97 71
Maligno 0.95 0.95 0.95 43
accuracy 0.96 114
macro avg 0.96 0.96 0.96 114
weighted avg 0.96 0.96 0.96 114
Reporte clasificación Naive Bayes
precision recall f1-score support
Benigno 0.96 0.99 0.97 71
Maligno 0.98 0.93 0.95 43
accuracy 0.96 114
macro avg 0.97 0.96 0.96 114
weighted avg 0.97 0.96 0.96 114
# ROC curves require continuous scores: feeding roc_curve the hard 0/1
# predictions (as before) collapses each curve to a single operating point.
# Use decision_function for the SVC and the positive-class probability for
# the tree and Naive Bayes. Also fixed the misspelled 'trhreshold' variable.
svm_fpr, svm_tpr, threshold = roc_curve(ytest, svc.decision_function(Xtest))
plt.plot(svm_fpr, svm_tpr, linestyle='-', label='SVC')
svm_fpr, svm_tpr, threshold = roc_curve(ytest, tree_clf.predict_proba(Xtest)[:, 1])
plt.plot(svm_fpr, svm_tpr, linestyle='-', label='Decision Tree')
svm_fpr, svm_tpr, threshold = roc_curve(ytest, naive_clf.predict_proba(Xtest)[:, 1])
plt.plot(svm_fpr, svm_tpr, linestyle='-', label='Naive Bayes')
plt.xlabel('Tasa de Falsos Positivos')
plt.ylabel('Tasa de Verdaderos Positivos')
plt.legend()
plt.show()
from sklearn.metrics import RocCurveDisplay
# Overlay the three ROC curves on a single axis for direct comparison.
fig, ax = plt.subplots(figsize=(8, 6))
models = {"SVC": svc, "Decision Tree": tree_clf, "Naive Bayes": naive_clf}
for model_name, model in models.items():
    RocCurveDisplay.from_estimator(model, Xtest, ytest, ax=ax, name=model_name)
ax.set_title("ROC Curves - Comparación de Clasificadores")
plt.show()