import seaborn as sns
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time

# Librerias de clasificadores
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import collections


# Otras Librerias
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
import warnings
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats
import xgboost as xgb
from sklearn.model_selection import KFold
from IPython.display import HTML, display
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('data/data.csv')

# Convertimos diagnosis de B/M a 0/1
df['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1})
df['diagnosis'].value_counts()

diagnosis
0    357
1    212
Name: count, dtype: int64

# definimos "y" como la variable diagnostico 
#definimos a "X" como el resto de variables sin diagnostico, unnamed:32, id

y = df.diagnosis # M or B 
list_drp = ['Unnamed: 32','id','diagnosis']
X = df.drop(list_drp,axis = 1 )

#cols_with_missing = [col for col in train.columns if train[col].isnull().any()]
cols_with_missing = df.isnull().sum()
cols_with_missing = cols_with_missing[cols_with_missing>0]
cols_with_missing.sort_values(inplace=True)
fig, ax = plt.subplots(figsize=(7,6))  
width = 0.70 # the width of the bars 
ind = np.arange(len(cols_with_missing))  # the x locations for the groups
ax.barh(ind, cols_with_missing, width, color="blue")
ax.set_yticks(ind+width/2)
ax.set_yticklabels(cols_with_missing.index, minor=False)
plt.xlabel('Count')
plt.ylabel('Features')

Text(0, 0.5, 'Features')

#vemos que unnamed esta vacía, por lo que la sacamos de la base de datos
lista=['Unnamed: 32','id']
df=df.drop(lista,axis = 1 )

# definimos "y" como la variable diagnostico 
#definimos a "X" como el resto de variables sin diagnostico, unnamed:32, id

y = df.diagnosis # M or B 
list_drp = ['diagnosis']
X = df.drop(list_drp,axis = 1 )





print('Los casos benignos son el', round(df['diagnosis'].value_counts()[0]/len(df) * 100,2), '% del dataset')
print('Los casos malignos son el', round(df['diagnosis'].value_counts()[1]/len(df) * 100,2), '% del dataset')
#graficamos el número de caos benignos y malignos, y de esta forma verificar si la clase es balanceada
ax = sns.countplot(y,label="Count")       # M = 212, B = 357
B, M = y.value_counts()
print('Número de benignos: ',B)
print('Número de malignos : ',M)
ax.set_ylabel('Número de pacientes')
bars = ax.patches
half = int(len(bars)/2)
left_bars = bars[:half]
right_bars = bars[half:]
for left, right in zip(left_bars, right_bars):
    height_l = left.get_height()
    height_r = right.get_height()
    total = height_l + height_r
    ax.text(left.get_x() + left.get_width()/2., height_l + 40, '{0:.0%}'.format(height_l/total), ha="center")
    ax.text(right.get_x() + right.get_width()/2., height_r + 40, '{0:.0%}'.format(height_r/total), ha="center")

Los casos benignos son el 62.74 % del dataset
Los casos malignos son el 37.26 % del dataset

Número de benignos:  357
Número de malignos :  212

# Vemos si existen registros duplicados
dups = X.duplicated()
# hacemos print de los registros duplicados si es que existen
print(dups.any())
print(X[dups])

False
Empty DataFrame
Columns: [radius_mean, texture_mean, perimeter_mean, area_mean, smoothness_mean, compactness_mean, concavity_mean, concave points_mean, symmetry_mean, fractal_dimension_mean, radius_se, texture_se, perimeter_se, area_se, smoothness_se, compactness_se, concavity_se, concave points_se, symmetry_se, fractal_dimension_se, radius_worst, texture_worst, perimeter_worst, area_worst, smoothness_worst, compactness_worst, concavity_worst, concave points_worst, symmetry_worst, fractal_dimension_worst]
Index: []

[0 rows x 30 columns]

#Hacemos un gráfico de las variables para ver las estadísticas descriptivas de cada una
#también nos permite ver que variables estan 
X.describe().T.style.bar(subset=['mean'], color='#205ff2')\
                            .background_gradient(subset=['std'], cmap='Reds')\
                            .background_gradient(subset=['50%'], cmap='coolwarm')

corrmat = X.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, annot=True, linewidths=.5, fmt= '.1f',ax=ax);

drop_list1 = ['perimeter_mean','radius_mean','compactness_mean','concave points_mean','radius_se','perimeter_se','radius_worst','perimeter_worst','compactness_worst','concave points_worst','compactness_se','concave points_se','texture_worst','area_worst']
x_1 = X.drop(drop_list1,axis = 1 ) 
#correlation map
f,ax = plt.subplots(figsize=(14, 14))
sns.heatmap(x_1.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)

<Axes: >

# visualize a minmax scaler transform of the sonar dataset
from pandas import read_csv
from pandas import DataFrame
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot

data = df.values[:, :]
# perform a robust scaler transform of the dataset
trans = MinMaxScaler()
data = trans.fit_transform(data)
# convert the array back to a dataframe
dfbalanceado = DataFrame(data)
dfbalanceado.columns = df.columns
# summarize
print(dfbalanceado.describe())
# histograms of the variables
dfbalanceado.hist()
pyplot.show()

        diagnosis  radius_mean  texture_mean  perimeter_mean   area_mean  \
count  569.000000   569.000000    569.000000      569.000000  569.000000   
mean     0.372583     0.338222      0.323965        0.332935    0.216920   
std      0.483918     0.166787      0.145453        0.167915    0.149274   
min      0.000000     0.000000      0.000000        0.000000    0.000000   
25%      0.000000     0.223342      0.218465        0.216847    0.117413   
50%      0.000000     0.302381      0.308759        0.293345    0.172895   
75%      1.000000     0.416442      0.408860        0.416765    0.271135   
max      1.000000     1.000000      1.000000        1.000000    1.000000   

       smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.394785          0.260601        0.208058             0.243137   
std           0.126967          0.161992        0.186785             0.192857   
min           0.000000          0.000000        0.000000             0.000000   
25%           0.304595          0.139685        0.069260             0.100944   
50%           0.390358          0.224679        0.144189             0.166501   
75%           0.475490          0.340531        0.306232             0.367793   
max           1.000000          1.000000        1.000000             1.000000   

       symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
count     569.000000  ...    569.000000     569.000000       569.000000   
mean        0.379605  ...      0.296663       0.363998         0.283138   
std         0.138456  ...      0.171940       0.163813         0.167352   
min         0.000000  ...      0.000000       0.000000         0.000000   
25%         0.282323  ...      0.180719       0.241471         0.167837   
50%         0.369697  ...      0.250445       0.356876         0.235320   
75%         0.453030  ...      0.386339       0.471748         0.373475   
max         1.000000  ...      1.000000       1.000000         1.000000   

       area_worst  smoothness_worst  compactness_worst  concavity_worst  \
count  569.000000        569.000000         569.000000       569.000000   
mean     0.170906          0.404138           0.220212         0.217403   
std      0.139932          0.150779           0.152649         0.166633   
min      0.000000          0.000000           0.000000         0.000000   
25%      0.081130          0.300007           0.116337         0.091454   
50%      0.123206          0.397081           0.179110         0.181070   
75%      0.220901          0.494156           0.302520         0.305831   
max      1.000000          1.000000           1.000000         1.000000   

       concave points_worst  symmetry_worst  fractal_dimension_worst  
count            569.000000      569.000000               569.000000  
mean               0.393836        0.263307                 0.189596  
std                0.225884        0.121954                 0.118466  
min                0.000000        0.000000                 0.000000  
25%                0.223127        0.185098                 0.107700  
50%                0.343402        0.247782                 0.163977  
75%                0.554639        0.318155                 0.242949  
max                1.000000        1.000000                 1.000000  

[8 rows x 31 columns]

#Este código colocamos la variable diagnosis en la última columna por comodidad
diagnosis = dfbalanceado['diagnosis']

dfbalanceado.drop(['diagnosis'], axis=1, inplace=True)
dfbalanceado.insert(30, 'diagnosis', diagnosis)


dfbalanceado.head()

print(dfbalanceado.columns
      )

Index(['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'diagnosis'],
      dtype='str')

print(len(dfbalanceado.columns))
print(len(dfbalanceado
          ))

31
569

print('Benigno', round(dfbalanceado['diagnosis'].value_counts()[0]/len(dfbalanceado) * 100,2), '% del dataset')
print('Maligno', round(dfbalanceado['diagnosis'].value_counts()[1]/len(dfbalanceado) * 100,2), '% del dataset')

X = dfbalanceado.drop('diagnosis', axis=1)
y = dfbalanceado['diagnosis']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

#convirtiendo los datos en array
Xtrain = Xtrain.values
Xtest = Xtest.values
ytrain = ytrain.values
ytest = ytest.values

# Viendo si la distribución de las etiqueta de train y test se distribuyen de manera similar
train_unique_label, train_counts_label = np.unique(ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(ytest, return_counts=True)
print('-' * 100)

print('Distribución de las etiquetas: \n')
print(train_counts_label/ len(ytrain))
print(test_counts_label/ len(ytest))
print(len(Xtrain))
print(len(Xtest))
print(len(ytrain))
print(len(ytest))
print(len(dfbalanceado))
print(len(dfbalanceado.columns))
print(len(X))
print(len(y))

Benigno 62.74 % del dataset
Maligno 37.26 % del dataset
----------------------------------------------------------------------------------------------------
Distribución de las etiquetas: 

[0.62857143 0.37142857]
[0.62280702 0.37719298]
455
114
455
114
569
31
569
569

from imblearn.under_sampling import RandomUnderSampler

# Vamos a mezclar los datos antes de crear las submuestras.

rus = RandomUnderSampler(random_state=0)
rus.fit(Xtrain, ytrain)
X_train_undersampling, y_train_undersampling = rus.fit_resample(Xtrain, ytrain)

df_under_sampling = pd.DataFrame(X_train_undersampling, columns=dfbalanceado.columns[:-1])
df_under_sampling['diagnosis'] = y_train_undersampling
df_under_sampling

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
ros.fit(Xtrain, ytrain)
X_train_oversampling, y_train_oversampling = ros.fit_resample(Xtrain, ytrain)

df_over_sampling = pd.DataFrame(X_train_oversampling, columns=dfbalanceado.columns[:-1])
df_over_sampling['diagnosis'] = y_train_oversampling
df_over_sampling

colors = ["#0101DF", "#DF0101"]
sns.countplot(x='diagnosis', data=df_under_sampling, palette=colors)
plt.title('Clases igualmente distribuidas', fontsize=14)
plt.show()

# Hay que asegurarse de usar la submuestra en nuestra correlación
f, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(24,20))

# DataFrame completo
corr = dfbalanceado.corr()
sns.heatmap(corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax1)
ax1.set_title("Matriz de correlación con imbalance \n (no se usa como referencia)", fontsize=14)

sub_sample_corr = df_under_sampling.corr()
sns.heatmap(sub_sample_corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax2)
ax2.set_title('Matriz de correlación usando submuestra \n (usar como referencia)', fontsize=14)

over_sample_corr = df_over_sampling.corr()
sns.heatmap(over_sample_corr, cmap='coolwarm_r', annot_kws={'size':20}, ax=ax3)
ax3.set_title('Matriz de correlación usando sobremuestra \n (usar como referencia)', fontsize=14)
plt.show()

# Se obtiene X e y a partir del nuevo dataset
X_train_undersampling = df_under_sampling.drop('diagnosis', axis=1)
y_train_undersampling = df_under_sampling['diagnosis']

classifiers = {
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "Naive_bayes": GaussianNB()
}

from sklearn.model_selection import cross_val_score

for key, classifier in classifiers.items():
    classifier.fit(X_train_undersampling, y_train_undersampling)
    training_score = cross_val_score(classifier, X_train_undersampling, y_train_undersampling, cv=5)
    print("Clasificador: ", classifier.__class__.__name__, "tiene una precisión del", round(training_score.mean(), 2) * 100, "%")

Clasificador:  SVC tiene una precisión del 97.0 %
Clasificador:  DecisionTreeClassifier tiene una precisión del 91.0 %
Clasificador:  GaussianNB tiene una precisión del 92.0 %

#GridSearchCV para encontrar los mejores hiperparámetros

from sklearn.model_selection import GridSearchCV

# Support Vector Classifier
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train_undersampling, y_train_undersampling)

# mejor modelo SVC  
svc = grid_svc.best_estimator_

# DecisionTree 
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)), 
              "min_samples_leaf": list(range(5,7,1))}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train_undersampling, y_train_undersampling)

# mejor modelo árbol de decisión  
tree_clf = grid_tree.best_estimator_

# Naive 
naive_clf = GaussianNB()
naive_clf.fit(X_train_undersampling, y_train_undersampling)

GaussianNB()

svc

SVC(C=0.7, kernel='linear')

tree_clf

DecisionTreeClassifier(max_depth=2, min_samples_leaf=5)

svc_score = cross_val_score(svc, X_train_undersampling, y_train_undersampling, cv=5)
print('Support Vector Classifier Cross Validation Score', round(svc_score.mean() * 100, 2).astype(str) + '%')

tree_score = cross_val_score(tree_clf, X_train_undersampling, y_train_undersampling, cv=5)
print('DecisionTree Classifier Cross Validation Score', round(tree_score.mean() * 100, 2).astype(str) + '%')

naive_score = cross_val_score(naive_clf, X_train_undersampling, y_train_undersampling, cv=5)
print('Naive Bayes Classifier Cross Validation Score', round(naive_score.mean() * 100, 2).astype(str) + '%')

Support Vector Classifier Cross Validation Score 97.63%
DecisionTree Classifier Cross Validation Score 92.28%
Naive Bayes Classifier Cross Validation Score 91.99%

from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
# Crea un DataFrame con todos los puntajes y los nombres de los clasificadores.

svc_pred = cross_val_predict(svc, X_train_undersampling, y_train_undersampling, cv=5,
                             method="decision_function")

tree_pred = cross_val_predict(tree_clf, X_train_undersampling, y_train_undersampling, cv=5)

naive_pred = cross_val_predict(naive_clf, X_train_undersampling, y_train_undersampling, cv=5)

from sklearn.metrics import roc_auc_score

print('Support Vector Classifier: ', roc_auc_score(y_train_undersampling, svc_pred))
print('Decision Tree Classifier: ', roc_auc_score(y_train_undersampling, tree_pred))
print('Naive Bayes Tree Classifier: ', roc_auc_score(y_train_undersampling, naive_pred))

Support Vector Classifier:  0.992297188473793
Decision Tree Classifier:  0.9260355029585798
Naive Bayes Tree Classifier:  0.9201183431952662

df_test = pd.DataFrame(Xtest, columns=dfbalanceado.columns[:-1])
df_test['diagnosis'] = ytest


print('Distribución de las clases en el conjunto de datos de submuestra - OVERSAMPLING')
print(df_test['diagnosis'].value_counts())



sns.countplot(x='diagnosis', data=df_test, palette=colors)
plt.title('Clases igualmente distribuidas', fontsize=14)
plt.show()

Distribución de las clases en el conjunto de datos de submuestra - OVERSAMPLING
diagnosis
0.0    71
1.0    43
Name: count, dtype: int64

from sklearn.metrics import confusion_matrix

y_pred_svc = svc.predict(Xtest)
y_pred_tree = tree_clf.predict(Xtest)
y_pred_naive = naive_clf.predict(Xtest)


svc_cf = confusion_matrix(ytest, y_pred_svc)
tree_cf = confusion_matrix(ytest, y_pred_tree)
naive_cf = confusion_matrix(ytest, y_pred_naive)

fig, ax = plt.subplots(2, 2,figsize=(22,12))


sns.heatmap(naive_cf, ax=ax[0][1], annot=True, cmap=plt.cm.copper)
ax[0][1].set_title("Naive Bayes \n Matriz de Confusión", fontsize=14)
ax[0][1].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[0][1].set_yticklabels(['', ''], fontsize=14, rotation=360)

sns.heatmap(svc_cf, ax=ax[1][0], annot=True, cmap=plt.cm.copper)
ax[1][0].set_title("Suppor Vector Machine \n Matriz de Confusión", fontsize=14)
ax[1][0].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[1][0].set_yticklabels(['', ''], fontsize=14, rotation=360)

sns.heatmap(tree_cf, ax=ax[1][1], annot=True, cmap=plt.cm.copper)
ax[1][1].set_title("DecisionTree \n Matriz de Confusión", fontsize=14)
ax[1][1].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[1][1].set_yticklabels(['', ''], fontsize=14, rotation=360)


plt.show()

y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ['Benigno', 'Maligno']
print("Reporte clasificación SVC")
print(classification_report(ytest, y_pred_svc, target_names=target_names))
print("")
print("Reporte clasificación Árbol")
print(classification_report(ytest, y_pred_tree, target_names=target_names))
print("")
print("Reporte clasificación Naive Bayes")
print(classification_report(ytest, y_pred_naive, target_names=target_names))

Reporte clasificación SVC
              precision    recall  f1-score   support

     Benigno       0.97      0.99      0.98        71
     Maligno       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Reporte clasificación Árbol
              precision    recall  f1-score   support

     Benigno       0.97      0.94      0.96        71
     Maligno       0.91      0.95      0.93        43

    accuracy                           0.95       114
   macro avg       0.94      0.95      0.94       114
weighted avg       0.95      0.95      0.95       114


Reporte clasificación Naive Bayes
              precision    recall  f1-score   support

     Benigno       0.96      0.99      0.97        71
     Maligno       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

# Se obtiene X e y a partir del nuevo dataset
df_over_sampling = df_over_sampling.sample(frac = 1)
df_over_sampling_2 = df_over_sampling.iloc[:50000]

X_train_oversampling = df_over_sampling_2.drop('diagnosis', axis=1)
y_train_oversampling = df_over_sampling_2['diagnosis']

classifiers = {
    "Support Vector Classifier": SVC(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "Naive_bayes": GaussianNB()
}

for key, classifier in classifiers.items():
    classifier.fit(X_train_oversampling, y_train_oversampling)
    training_score = cross_val_score(classifier, X_train_oversampling, y_train_oversampling, cv=5)
    print("Clasificador: ", classifier.__class__.__name__, "tiene una precisión del", round(training_score.mean(), 2) * 100, "%")

Clasificador:  SVC tiene una precisión del 98.0 %
Clasificador:  DecisionTreeClassifier tiene una precisión del 95.0 %
Clasificador:  GaussianNB tiene una precisión del 92.0 %

#GridSearchCV para encontrar los mejores hiperparámetros

# Support Vector Classifier
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train_oversampling, y_train_oversampling)

# mejor modelo SVC  
svc = grid_svc.best_estimator_

# DecisionTree 
tree_params = {"criterion": ["gini", "entropy"], "max_depth": list(range(2,4,1)), 
              "min_samples_leaf": list(range(5,7,1))}
grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
grid_tree.fit(X_train_oversampling, y_train_oversampling)

# mejor modelo árbol de decisión  
tree_clf = grid_tree.best_estimator_

# Naive 
naive_clf = GaussianNB()
naive_clf.fit(X_train_oversampling, y_train_oversampling)

GaussianNB()

svc

SVC(C=0.9)

tree_clf

DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=6)

svc_score = cross_val_score(svc, X_train_oversampling, y_train_oversampling, cv=5)
print('Support Vector Classifier Cross Validation Score', round(svc_score.mean() * 100, 2).astype(str) + '%')

tree_score = cross_val_score(tree_clf, X_train_oversampling, y_train_oversampling, cv=5)
print('DecisionTree Classifier Cross Validation Score', round(tree_score.mean() * 100, 2).astype(str) + '%')

naive_score = cross_val_score(naive_clf, X_train_oversampling, y_train_oversampling, cv=5)
print('Naive Bayes Classifier Cross Validation Score', round(naive_score.mean() * 100, 2).astype(str) + '%')

Support Vector Classifier Cross Validation Score 98.25%
DecisionTree Classifier Cross Validation Score 94.42%
Naive Bayes Classifier Cross Validation Score 92.48%

from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_predict
# Crea un DataFrame con todos los puntajes y los nombres de los clasificadores.

svc_pred = cross_val_predict(svc, X_train_oversampling, y_train_oversampling, cv=5,
                             method="decision_function")

tree_pred = cross_val_predict(tree_clf, X_train_oversampling, y_train_oversampling, cv=5)

naive_pred = cross_val_predict(naive_clf, X_train_oversampling, y_train_oversampling, cv=5)

from sklearn.metrics import roc_auc_score

print('Support Vector Classifier: ', roc_auc_score(y_train_oversampling, svc_pred))
print('Decision Tree Classifier: ', roc_auc_score(y_train_oversampling, tree_pred))
print('Naive Bayes Tree Classifier: ', roc_auc_score(y_train_oversampling, naive_pred))

Support Vector Classifier:  0.997016969044941
Decision Tree Classifier:  0.9423076923076923
Naive Bayes Tree Classifier:  0.9248251748251749

df_test = pd.DataFrame(Xtest, columns=dfbalanceado.columns[:-1])
df_test['diagnosis'] = ytest


print('Distribución de las clases en el conjunto de datos de submuestra - OVERSAMPLING')
print(df_test['diagnosis'].value_counts())



sns.countplot(x='diagnosis', data=df_test, palette=colors)
plt.title('Clases igualmente distribuidas', fontsize=14)
plt.show()

Distribución de las clases en el conjunto de datos de submuestra - OVERSAMPLING
diagnosis
0.0    71
1.0    43
Name: count, dtype: int64

from sklearn.metrics import confusion_matrix

y_pred_svc = svc.predict(Xtest)
y_pred_tree = tree_clf.predict(Xtest)
y_pred_naive = naive_clf.predict(Xtest)


svc_cf = confusion_matrix(ytest, y_pred_svc)
tree_cf = confusion_matrix(ytest, y_pred_tree)
naive_cf = confusion_matrix(ytest, y_pred_naive)

fig, ax = plt.subplots(2, 2,figsize=(22,12))


sns.heatmap(naive_cf, ax=ax[0][1], annot=True, cmap=plt.cm.copper)
ax[0][1].set_title("Naive Bayes \n Matriz de Confusión", fontsize=14)
ax[0][1].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[0][1].set_yticklabels(['', ''], fontsize=14, rotation=360)

sns.heatmap(svc_cf, ax=ax[1][0], annot=True, cmap=plt.cm.copper)
ax[1][0].set_title("Suppor Vector Machine \n Matriz de Confusión", fontsize=14)
ax[1][0].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[1][0].set_yticklabels(['', ''], fontsize=14, rotation=360)

sns.heatmap(tree_cf, ax=ax[1][1], annot=True, cmap=plt.cm.copper)
ax[1][1].set_title("DecisionTree \n Matriz de Confusión", fontsize=14)
ax[1][1].set_xticklabels(['', ''], fontsize=14, rotation=90)
ax[1][1].set_yticklabels(['', ''], fontsize=14, rotation=360)


plt.show()

y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ['Benigno', 'Maligno']
print("Reporte clasificación SVC")
print(classification_report(ytest, y_pred_svc, target_names=target_names))
print("")
print("Reporte clasificación Árbol")
print(classification_report(ytest, y_pred_tree, target_names=target_names))
print("")
print("Reporte clasificación Naive Bayes")
print(classification_report(ytest, y_pred_naive, target_names=target_names))

Reporte clasificación SVC
              precision    recall  f1-score   support

     Benigno       0.97      0.99      0.98        71
     Maligno       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Reporte clasificación Árbol
              precision    recall  f1-score   support

     Benigno       0.95      1.00      0.97        71
     Maligno       1.00      0.91      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114


Reporte clasificación Naive Bayes
              precision    recall  f1-score   support

     Benigno       0.96      0.99      0.97        71
     Maligno       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

svm_fpr,svm_tpr, trhreshold = roc_curve(ytest, y_pred_svc)
plt.plot(svm_fpr,svm_tpr, linestyle='-')

[<matplotlib.lines.Line2D at 0x7ffa35df6c50>]

svm_fpr,svm_tpr, trhreshold = roc_curve(ytest,y_pred_tree )
plt.plot(svm_fpr,svm_tpr, linestyle='-')

[<matplotlib.lines.Line2D at 0x7ffa36315250>]

svm_fpr,svm_tpr, trhreshold = roc_curve(ytest,y_pred_naive )
plt.plot(svm_fpr,svm_tpr, linestyle='-')

[<matplotlib.lines.Line2D at 0x7ffa3639ab50>]

from sklearn.metrics import RocCurveDisplay

fig, ax = plt.subplots(figsize=(8, 6))
for name, clf in [("SVC", svc), ("Decision Tree", tree_clf), ("Naive Bayes", naive_clf)]:
    RocCurveDisplay.from_estimator(clf, Xtest, ytest, ax=ax, name=name)
ax.set_title("ROC Curves - Comparación de Clasificadores")
plt.show()

	count	mean	std	min	25%	50%	75%	max
radius_mean	569.000000	14.127292	3.524049	6.981000	11.700000	13.370000	15.780000	28.110000
texture_mean	569.000000	19.289649	4.301036	9.710000	16.170000	18.840000	21.800000	39.280000
perimeter_mean	569.000000	91.969033	24.298981	43.790000	75.170000	86.240000	104.100000	188.500000
area_mean	569.000000	654.889104	351.914129	143.500000	420.300000	551.100000	782.700000	2501.000000
smoothness_mean	569.000000	0.096360	0.014064	0.052630	0.086370	0.095870	0.105300	0.163400
compactness_mean	569.000000	0.104341	0.052813	0.019380	0.064920	0.092630	0.130400	0.345400
concavity_mean	569.000000	0.088799	0.079720	0.000000	0.029560	0.061540	0.130700	0.426800
concave points_mean	569.000000	0.048919	0.038803	0.000000	0.020310	0.033500	0.074000	0.201200
symmetry_mean	569.000000	0.181162	0.027414	0.106000	0.161900	0.179200	0.195700	0.304000
fractal_dimension_mean	569.000000	0.062798	0.007060	0.049960	0.057700	0.061540	0.066120	0.097440
radius_se	569.000000	0.405172	0.277313	0.111500	0.232400	0.324200	0.478900	2.873000
texture_se	569.000000	1.216853	0.551648	0.360200	0.833900	1.108000	1.474000	4.885000
perimeter_se	569.000000	2.866059	2.021855	0.757000	1.606000	2.287000	3.357000	21.980000
area_se	569.000000	40.337079	45.491006	6.802000	17.850000	24.530000	45.190000	542.200000
smoothness_se	569.000000	0.007041	0.003003	0.001713	0.005169	0.006380	0.008146	0.031130
compactness_se	569.000000	0.025478	0.017908	0.002252	0.013080	0.020450	0.032450	0.135400
concavity_se	569.000000	0.031894	0.030186	0.000000	0.015090	0.025890	0.042050	0.396000
concave points_se	569.000000	0.011796	0.006170	0.000000	0.007638	0.010930	0.014710	0.052790
symmetry_se	569.000000	0.020542	0.008266	0.007882	0.015160	0.018730	0.023480	0.078950
fractal_dimension_se	569.000000	0.003795	0.002646	0.000895	0.002248	0.003187	0.004558	0.029840
radius_worst	569.000000	16.269190	4.833242	7.930000	13.010000	14.970000	18.790000	36.040000
texture_worst	569.000000	25.677223	6.146258	12.020000	21.080000	25.410000	29.720000	49.540000
perimeter_worst	569.000000	107.261213	33.602542	50.410000	84.110000	97.660000	125.400000	251.200000
area_worst	569.000000	880.583128	569.356993	185.200000	515.300000	686.500000	1084.000000	4254.000000
smoothness_worst	569.000000	0.132369	0.022832	0.071170	0.116600	0.131300	0.146000	0.222600
compactness_worst	569.000000	0.254265	0.157336	0.027290	0.147200	0.211900	0.339100	1.058000
concavity_worst	569.000000	0.272188	0.208624	0.000000	0.114500	0.226700	0.382900	1.252000
concave points_worst	569.000000	0.114606	0.065732	0.000000	0.064930	0.099930	0.161400	0.291000
symmetry_worst	569.000000	0.290076	0.061867	0.156500	0.250400	0.282200	0.317900	0.663800
fractal_dimension_worst	569.000000	0.083946	0.018061	0.055040	0.071460	0.080040	0.092080	0.207500

	radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave points_mean	symmetry_mean	fractal_dimension_mean	...	texture_worst	perimeter_worst	area_worst	smoothness_worst	compactness_worst	concavity_worst	concave points_worst	symmetry_worst	fractal_dimension_worst	diagnosis
0	0.521037	0.022658	0.545989	0.363733	0.593753	0.792037	0.703140	0.731113	0.686364	0.605518	...	0.141525	0.668310	0.450698	0.601136	0.619292	0.568610	0.912027	0.598462	0.418864	1.0
1	0.643144	0.272574	0.615783	0.501591	0.289880	0.181768	0.203608	0.348757	0.379798	0.141323	...	0.303571	0.539818	0.435214	0.347553	0.154563	0.192971	0.639175	0.233590	0.222878	1.0
2	0.601496	0.390260	0.595743	0.449417	0.514309	0.431017	0.462512	0.635686	0.509596	0.211247	...	0.360075	0.508442	0.374508	0.483590	0.385375	0.359744	0.835052	0.403706	0.213433	1.0
3	0.210090	0.360839	0.233501	0.102906	0.811321	0.811361	0.565604	0.522863	0.776263	1.000000	...	0.385928	0.241347	0.094008	0.915472	0.814012	0.548642	0.884880	1.000000	0.773711	1.0
4	0.629893	0.156578	0.630986	0.489290	0.430351	0.347893	0.463918	0.518390	0.378283	0.186816	...	0.123934	0.506948	0.341575	0.437364	0.172415	0.319489	0.558419	0.157500	0.142595	1.0

	radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave points_mean	symmetry_mean	fractal_dimension_mean	...	texture_worst	perimeter_worst	area_worst	smoothness_worst	compactness_worst	concavity_worst	concave points_worst	symmetry_worst	fractal_dimension_worst	diagnosis
0	0.308060	0.425769	0.297975	0.177094	0.314977	0.176676	0.111317	0.168191	0.378283	0.152064	...	0.527719	0.241994	0.126229	0.297365	0.139525	0.182268	0.440550	0.257441	0.092680	0.0
1	0.135643	0.201894	0.132748	0.063499	0.381782	0.198791	0.054592	0.120080	0.165152	0.399115	...	0.292377	0.119080	0.047016	0.467080	0.191140	0.067364	0.224330	0.184703	0.243015	0.0
2	0.239907	0.166385	0.236680	0.129714	0.455629	0.219434	0.154452	0.136630	0.310606	0.220514	...	0.231343	0.196574	0.097670	0.516608	0.182699	0.243610	0.225017	0.232998	0.183458	0.0
3	0.247953	0.349341	0.246562	0.131326	0.514309	0.293908	0.191542	0.107654	0.537374	0.399747	...	0.323827	0.172917	0.081130	0.455854	0.198126	0.282348	0.277938	0.225508	0.218746	0.0
4	0.204411	0.286777	0.208279	0.104305	0.390810	0.346973	0.362699	0.141849	0.502020	0.562974	...	0.424840	0.183027	0.070709	0.419534	0.443878	0.593930	0.418557	0.343584	0.489702	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
333	0.569786	0.503213	0.540460	0.395546	0.339984	0.310472	0.343955	0.411083	0.451010	0.133319	...	0.490139	0.510434	0.353372	0.354817	0.284571	0.459665	0.672165	0.471319	0.248196	1.0
334	0.428274	0.196145	0.428512	0.275589	0.381692	0.361082	0.282099	0.349950	0.364646	0.206403	...	0.265458	0.367996	0.217460	0.477646	0.407981	0.395847	0.680756	0.286615	0.237439	1.0
335	0.341190	0.476835	0.339161	0.198176	0.379164	0.341145	0.261246	0.321173	0.593434	0.302654	...	0.608475	0.321679	0.153878	0.559532	0.367329	0.299042	0.608935	0.622708	0.311951	1.0
336	0.552747	0.250592	0.536314	0.395970	0.476393	0.277958	0.341378	0.430666	0.457576	0.256318	...	0.343284	0.473081	0.335185	0.522552	0.195797	0.261342	0.575258	0.261975	0.193625	1.0
337	0.331251	0.335137	0.327068	0.193425	0.481809	0.288080	0.263824	0.321223	0.307576	0.326032	...	0.500533	0.316201	0.168133	0.595192	0.319692	0.325000	0.627835	0.318155	0.330972	1.0

	radius_mean	texture_mean	perimeter_mean	area_mean	smoothness_mean	compactness_mean	concavity_mean	concave points_mean	symmetry_mean	fractal_dimension_mean	...	texture_worst	perimeter_worst	area_worst	smoothness_worst	compactness_worst	concavity_worst	concave points_worst	symmetry_worst	fractal_dimension_worst	diagnosis
0	0.096928	0.257694	0.103656	0.045387	0.487226	0.373965	0.733365	0.217445	0.530808	0.642376	...	0.283316	0.075153	0.034285	0.508684	0.397018	1.000000	0.601375	0.524936	0.409681	0.0
1	0.667755	0.570172	0.683505	0.495228	0.554934	0.809214	0.582709	0.743539	0.674242	0.505897	...	0.571962	0.627970	0.467902	0.514627	0.709327	0.541534	0.997595	0.499310	0.481175	1.0
2	0.103744	0.140345	0.106489	0.049799	0.221901	0.208975	0.140300	0.108350	0.646970	0.414280	...	0.192164	0.075601	0.030697	0.179555	0.136324	0.111581	0.174811	0.338459	0.195855	0.0
3	0.173648	0.524518	0.167369	0.086320	0.396678	0.162444	0.055740	0.080268	0.422727	0.280750	...	0.617537	0.137308	0.066482	0.519910	0.109158	0.089856	0.210859	0.363493	0.173357	0.0
4	0.150930	0.174839	0.143459	0.071432	0.548614	0.187811	0.025398	0.064115	0.850000	0.413648	...	0.144723	0.096867	0.045075	0.371987	0.069244	0.017316	0.088625	0.392667	0.165027	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
567	0.550381	0.356442	0.541151	0.403181	0.377088	0.267530	0.349110	0.384245	0.321717	0.148062	...	0.406183	0.445690	0.299302	0.413590	0.178916	0.275240	0.512027	0.152967	0.125738	1.0
568	0.593923	0.769699	0.581922	0.457900	0.285005	0.287160	0.268276	0.329871	0.185859	0.066765	...	0.889925	0.646397	0.563262	0.459816	0.371016	0.319089	0.558419	0.226296	0.135380	1.0
569	0.404610	0.806561	0.414000	0.255101	0.484517	0.443286	0.410262	0.417445	0.520707	0.348357	...	1.000000	0.377957	0.208858	0.773493	0.513345	0.455511	0.692096	0.383797	0.428703	1.0
570	0.638885	0.397362	0.613019	0.493107	0.279137	0.196614	0.211856	0.299304	0.205556	0.038121	...	0.377132	0.554261	0.384585	0.340950	0.197737	0.252236	0.496564	0.132663	0.106454	1.0
571	0.397037	0.441326	0.389814	0.248017	0.355421	0.258328	0.262887	0.371918	0.331818	0.231887	...	0.368337	0.284327	0.158695	0.360100	0.167273	0.227316	0.507216	0.195348	0.086842	1.0

	priors priors: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.	None
	var_smoothing var_smoothing: float, default=1e-9 Portion of the largest variance of all features that is added to variances for calculation stability. .. versionadded:: 0.20	1e-09

Clasificacion de Diagnostico de Cancer de Mama con Machine Learning¶

1. Analisis Exploratorio de Datos (EDA)¶

Mapa de Calor de Correlaciones¶

Interpretacion de Correlaciones y Seleccion de Variables¶

2. Preprocesamiento¶

3. Manejo de Desbalance de Clases¶

3.1 Random Under-Sampling¶

3.2 Random Over-Sampling¶

3.3 Matrices de Correlacion Post-Remuestreo¶

4. Modelamiento y Evaluacion (Under-Sampling)¶

4.1 Matriz de Confusion y Metricas (Under-Sampling)¶

5. Modelamiento y Evaluacion (Over-Sampling)¶

6. Comparacion Final¶

	C C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.	0.7
	kernel kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.	'linear'
	degree degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.	3
	gamma gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22 The default value of ``gamma`` changed from 'auto' to 'scale'.	'scale'
	coef0 coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.	0.0
	shrinking shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.	True
	probability probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.	False
	tol tol: float, default=1e-3 Tolerance for stopping criterion.	0.001
	cache_size cache_size: float, default=200 Specify the size of the kernel cache (in MB).	200
	class_weight class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]C for SVC. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes np.bincount(y))``.	None
	verbose verbose: bool, default=False Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context.	False
	max_iter max_iter: int, default=-1 Hard limit on iterations within solver, or -1 for no limit.	-1
	decision_function_shape decision_function_shape: {'ovo', 'ovr'}, default='ovr' Whether to return a one-vs-rest ('ovr') decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, note that internally, one-vs-one ('ovo') is always used as a multi-class strategy to train models; an ovr matrix is only constructed from the ovo matrix. The parameter is ignored for binary classification. .. versionchanged:: 0.19 decision_function_shape is 'ovr' by default. .. versionadded:: 0.17 decision_function_shape='ovr' is recommended. .. versionchanged:: 0.17 Deprecated decision_function_shape='ovo' and None.	'ovr'
	break_ties break_ties: bool, default=False If true, ``decision_function_shape='ovr'``, and number of classes > 2, :term:`predict` will break ties according to the confidence values of :term:`decision_function`; otherwise the first class among the tied classes is returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple predict. See :ref:`sphx_glr_auto_examples_svm_plot_svm_tie_breaking.py` for an example of its usage with ``decision_function_shape='ovr'``. .. versionadded:: 0.22	False
	random_state random_state: int, RandomState instance or None, default=None Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when `probability` is False. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.	None

	criterion criterion: {"gini", "entropy", "log_loss"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "log_loss" and "entropy" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.	'gini'
	splitter splitter: {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.	'best'
	max_depth max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.	2
	min_samples_split min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for fractions.	2
	min_samples_leaf min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions.	5
	min_weight_fraction_leaf min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.	0.0
	max_features max_features: int, float or {"sqrt", "log2"}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and `max(1, int(max_features * n_features_in_))` features are considered at each split. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note:: The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.	None
	random_state random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``"best"``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.	None
	max_leaf_nodes max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.	None
	min_impurity_decrease min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19	0.0
	class_weight class_weight: dict, list of dict or "balanced", default=None Weights associated with classes in the form ``{class_label: weight}``. If None, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. Note that for multioutput (including multilabel) weights should be defined for each class of every column in its own dict. For example, for four-class multilabel classification weights should be [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of [{1:1}, {2:5}, {3:1}, {4:1}]. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.	None
	ccp_alpha ccp_alpha: non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See :ref:`minimal_cost_complexity_pruning` for details. See :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` for an example of such pruning. .. versionadded:: 0.22	0.0
	monotonic_cst monotonic_cst: array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase - 0: no constraint - -1: monotonic decrease If monotonic_cst is None, no constraints are applied. Monotonicity constraints are not supported for: - multiclass classifications (i.e. when `n_classes > 2`), - multioutput classifications (i.e. when `n_outputs_ > 1`), - classifications trained on data with missing values. The constraints hold over the probability of the positive class. Read more in the :ref:`User Guide `. .. versionadded:: 1.4	None

	C C: float, default=1.0 Regularization parameter. The strength of the regularization is inversely proportional to C. Must be strictly positive. The penalty is a squared l2 penalty. For an intuitive visualization of the effects of scaling the regularization parameter C, see :ref:`sphx_glr_auto_examples_svm_plot_svm_scale_c.py`.	0.9
	kernel kernel: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'} or callable, default='rbf' Specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. If a callable is given it is used to pre-compute the kernel matrix from data matrices; that matrix should be an array of shape ``(n_samples, n_samples)``. For an intuitive visualization of different kernel types see :ref:`sphx_glr_auto_examples_svm_plot_svm_kernels.py`.	'rbf'
	degree degree: int, default=3 Degree of the polynomial kernel function ('poly'). Must be non-negative. Ignored by all other kernels.	3
	gamma gamma: {'scale', 'auto'} or float, default='scale' Kernel coefficient for 'rbf', 'poly' and 'sigmoid'. - if ``gamma='scale'`` (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma, - if 'auto', uses 1 / n_features - if float, must be non-negative. .. versionchanged:: 0.22 The default value of ``gamma`` changed from 'auto' to 'scale'.	'scale'
	coef0 coef0: float, default=0.0 Independent term in kernel function. It is only significant in 'poly' and 'sigmoid'.	0.0
	shrinking shrinking: bool, default=True Whether to use the shrinking heuristic. See the :ref:`User Guide `.	True
	probability probability: bool, default=False Whether to enable probability estimates. This must be enabled prior to calling `fit`, will slow down that method as it internally uses 5-fold cross-validation, and `predict_proba` may be inconsistent with `predict`. Read more in the :ref:`User Guide `.	False
	tol tol: float, default=1e-3 Tolerance for stopping criterion.	0.001
	cache_size cache_size: float, default=200 Specify the size of the kernel cache (in MB).	200
	class_weight class_weight: dict or 'balanced', default=None Set the parameter C of class i to class_weight[i]C for SVC. If not given, all classes are supposed to have weight one. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes np.bincount(y))``.	None
	verbose verbose: bool, default=False Enable verbose output. Note that this setting takes advantage of a per-process runtime setting in libsvm that, if enabled, may not work properly in a multithreaded context.	False
	max_iter max_iter: int, default=-1 Hard limit on iterations within solver, or -1 for no limit.	-1
	decision_function_shape decision_function_shape: {'ovo', 'ovr'}, default='ovr' Whether to return a one-vs-rest ('ovr') decision function of shape (n_samples, n_classes) as all other classifiers, or the original one-vs-one ('ovo') decision function of libsvm which has shape (n_samples, n_classes * (n_classes - 1) / 2). However, note that internally, one-vs-one ('ovo') is always used as a multi-class strategy to train models; an ovr matrix is only constructed from the ovo matrix. The parameter is ignored for binary classification. .. versionchanged:: 0.19 decision_function_shape is 'ovr' by default. .. versionadded:: 0.17 decision_function_shape='ovr' is recommended. .. versionchanged:: 0.17 Deprecated decision_function_shape='ovo' and None.	'ovr'
	break_ties break_ties: bool, default=False If true, ``decision_function_shape='ovr'``, and number of classes > 2, :term:`predict` will break ties according to the confidence values of :term:`decision_function`; otherwise the first class among the tied classes is returned. Please note that breaking ties comes at a relatively high computational cost compared to a simple predict. See :ref:`sphx_glr_auto_examples_svm_plot_svm_tie_breaking.py` for an example of its usage with ``decision_function_shape='ovr'``. .. versionadded:: 0.22	False
	random_state random_state: int, RandomState instance or None, default=None Controls the pseudo random number generation for shuffling the data for probability estimates. Ignored when `probability` is False. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `.	None

	criterion criterion: {"gini", "entropy", "log_loss"}, default="gini" The function to measure the quality of a split. Supported criteria are "gini" for the Gini impurity and "log_loss" and "entropy" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`.	'entropy'
	splitter splitter: {"best", "random"}, default="best" The strategy used to choose the split at each node. Supported strategies are "best" to choose the best split and "random" to choose the best random split.	'best'
	max_depth max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.	3
	min_samples_split min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and `ceil(min_samples_split * n_samples)` are the minimum number of samples for each split. .. versionchanged:: 0.18 Added float values for fractions.	2
	min_samples_leaf min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and `ceil(min_samples_leaf * n_samples)` are the minimum number of samples for each node. .. versionchanged:: 0.18 Added float values for fractions.	6
	min_weight_fraction_leaf min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.	0.0
	max_features max_features: int, float or {"sqrt", "log2"}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and `max(1, int(max_features * n_features_in_))` features are considered at each split. - If "sqrt", then `max_features=sqrt(n_features)`. - If "log2", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. note:: The search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.	None
	random_state random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``"best"``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.	None
	max_leaf_nodes max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.	None
	min_impurity_decrease min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following:: N_t / N * (impurity - N_t_R / N_t * right_impurity - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19	0.0
	class_weight class_weight: dict, list of dict or "balanced", default=None Weights associated with classes in the form ``{class_label: weight}``. If None, all classes are supposed to have weight one. For multi-output problems, a list of dicts can be provided in the same order as the columns of y. Note that for multioutput (including multilabel) weights should be defined for each class of every column in its own dict. For example, for four-class multilabel classification weights should be [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of [{1:1}, {2:5}, {3:1}, {4:1}]. The "balanced" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))`` For multi-output, the weights of each column of y will be multiplied. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified.	None
	ccp_alpha ccp_alpha: non-negative float, default=0.0 Complexity parameter used for Minimal Cost-Complexity Pruning. The subtree with the largest cost complexity that is smaller than ``ccp_alpha`` will be chosen. By default, no pruning is performed. See :ref:`minimal_cost_complexity_pruning` for details. See :ref:`sphx_glr_auto_examples_tree_plot_cost_complexity_pruning.py` for an example of such pruning. .. versionadded:: 0.22	0.0
	monotonic_cst monotonic_cst: array-like of int of shape (n_features), default=None Indicates the monotonicity constraint to enforce on each feature. - 1: monotonic increase - 0: no constraint - -1: monotonic decrease If monotonic_cst is None, no constraints are applied. Monotonicity constraints are not supported for: - multiclass classifications (i.e. when `n_classes > 2`), - multioutput classifications (i.e. when `n_outputs_ > 1`), - classifications trained on data with missing values. The constraints hold over the probability of the positive class. Read more in the :ref:`User Guide `. .. versionadded:: 1.4	None