Python for Nurses (23)¶
Predicting the occurrence of a clinical condition¶
In [6]:
# Study workflow for predicting the occurrence of diabetes, for example
from IPython.display import Image
Image('Workflow.png')
Out[6]:
In [7]:
# Importing the working modules/libraries
import pandas as pd
import matplotlib as mat
import matplotlib.pyplot as plt
import numpy as np
# render plots inline in the notebook
%matplotlib inline
In [8]:
# Loading the dataset into a DataFrame (the file is in the Jupyter Lab working directory)
df = pd.read_csv("pima-data.csv")
In [9]:
# Checking the shape of the data
# (data preparation step)
df.shape
Out[9]:
(768, 10)
In [10]:
# Inspecting the first 5 rows of the dataset
df.head(5)
Out[10]:
| | num_preg | glucose_conc | diastolic_bp | thickness | insulin | bmi | diab_pred | age | skin | diabetes |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1.3780 | True |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 1.1426 | False |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 0.0000 | True |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0.9062 | False |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1.3790 | True |
In [11]:
# Inspecting the last 5 rows of the dataset
df.tail(5)
Out[11]:
| | num_preg | glucose_conc | diastolic_bp | thickness | insulin | bmi | diab_pred | age | skin | diabetes |
|---|---|---|---|---|---|---|---|---|---|---|
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 1.8912 | False |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 1.0638 | False |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0.9062 | False |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 0.0000 | True |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 1.2214 | False |
In [12]:
# Checking whether any values are null
df.isnull().values.any()
Out[12]:
False
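Note that isnull() only detects explicit NaN values. In this dataset the missing measurements are encoded as zeros, which is why the check returns False; these "hidden" missing values are handled in the impute section further below.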
In [13]:
# Identifying the correlation between the variables
# Correlation does not imply causation
def plot_corr(df, size=10):
    # compute the pairwise correlation matrix and draw it as a colored grid
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    plt.xticks(range(len(corr.columns)), corr.columns)
    plt.yticks(range(len(corr.columns)), corr.columns)
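matshow alone gives no legend for the color scale. A slightly extended sketch (assuming the same df and matplotlib import as above) pins the colors to the [-1, 1] correlation range and adds a colorbar:

def plot_corr_labeled(df, size=10):
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    # fix the scale to the full correlation range so colors are comparable
    cax = ax.matshow(corr, vmin=-1, vmax=1)
    fig.colorbar(cax)  # legend mapping colors to coefficients
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)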
In [14]:
# Plotting the correlation (or lack thereof) between the variables
plot_corr(df)
In [15]:
# Viewing the same correlations as a table
# Correlation coefficient:
# +1 = strong positive correlation
#  0 = no correlation
# -1 = strong negative correlation
df.corr()
Out[15]:
| | num_preg | glucose_conc | diastolic_bp | thickness | insulin | bmi | diab_pred | age | skin | diabetes |
|---|---|---|---|---|---|---|---|---|---|---|
| num_preg | 1.000000 | 0.129459 | 0.141282 | -0.081672 | -0.073535 | 0.017683 | -0.033523 | 0.544341 | -0.081673 | 0.221898 |
| glucose_conc | 0.129459 | 1.000000 | 0.152590 | 0.057328 | 0.331357 | 0.221071 | 0.137337 | 0.263514 | 0.057326 | 0.466581 |
| diastolic_bp | 0.141282 | 0.152590 | 1.000000 | 0.207371 | 0.088933 | 0.281805 | 0.041265 | 0.239528 | 0.207371 | 0.065068 |
| thickness | -0.081672 | 0.057328 | 0.207371 | 1.000000 | 0.436783 | 0.392573 | 0.183928 | -0.113970 | 1.000000 | 0.074752 |
| insulin | -0.073535 | 0.331357 | 0.088933 | 0.436783 | 1.000000 | 0.197859 | 0.185071 | -0.042163 | 0.436785 | 0.130548 |
| bmi | 0.017683 | 0.221071 | 0.281805 | 0.392573 | 0.197859 | 1.000000 | 0.140647 | 0.036242 | 0.392574 | 0.292695 |
| diab_pred | -0.033523 | 0.137337 | 0.041265 | 0.183928 | 0.185071 | 0.140647 | 1.000000 | 0.033561 | 0.183927 | 0.173844 |
| age | 0.544341 | 0.263514 | 0.239528 | -0.113970 | -0.042163 | 0.036242 | 0.033561 | 1.000000 | -0.113973 | 0.238356 |
| skin | -0.081673 | 0.057326 | 0.207371 | 1.000000 | 0.436785 | 0.392574 | 0.183927 | -0.113973 | 1.000000 | 0.074750 |
| diabetes | 0.221898 | 0.466581 | 0.065068 | 0.074752 | 0.130548 | 0.292695 | 0.173844 | 0.238356 | 0.074750 | 1.000000 |
In [16]:
''' Machine learning algorithms only understand numeric data, so the "map" function
is used to convert True/False into numbers '''
# Defining the classes
diabetes_map = {True: 1, False: 0}
In [17]:
# Applying the mapping to the dataset
df['diabetes'] = df['diabetes'].map(diabetes_map)
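Since the column is boolean, the same conversion can be done in one step (a sketch; pandas casts True/False to 1/0 directly):

# Equivalent alternative: cast the boolean column straight to integers
df['diabetes'] = df['diabetes'].astype(int)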
In [18]:
# Checking the first rows of the dataset to confirm the change
df.head(5)
Out[18]:
| | num_preg | glucose_conc | diastolic_bp | thickness | insulin | bmi | diab_pred | age | skin | diabetes |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1.3780 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 1.1426 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 0.0000 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0.9062 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1.3790 | 1 |
In [19]:
# Checking how the data are distributed between the classes
num_true = len(df.loc[df['diabetes'] == 1])
num_false = len(df.loc[df['diabetes'] == 0])
print("Number of True cases : {0} ({1:2.2f}%)".format(num_true, (num_true / (num_true + num_false)) * 100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false / (num_true + num_false)) * 100))
Number of True cases : 268 (34.90%)
Number of False cases: 500 (65.10%)
Splitting: dividing the data into training (70%) and test (30%) sets¶
In [20]:
from IPython.display import Image
Image('Treinamento.png')
Out[20]:
In [21]:
import sklearn as sk
sk.__version__
Out[21]:
'0.23.2'
In [22]:
# the function that creates the train/test split
from sklearn.model_selection import train_test_split
In [23]:
# Selecting the predictor variables (feature selection); the 'skin' column is left out
atributos = ['num_preg', 'glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi', 'diab_pred', 'age']
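The same list can also be built programmatically (a sketch, assuming df as loaded above); note the correlation table showed 'skin' is perfectly correlated with 'thickness', so it carries no extra information:

# Keep every column except the target and the redundant 'skin' feature
atributos = [c for c in df.columns if c not in ('skin', 'diabetes')]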
In [24]:
# Variable to be predicted (the target)
atrib_prev = ['diabetes']
In [25]:
# Creating the feature matrix and target vector
X = df[atributos].values
Y = df[atrib_prev].values
In [26]:
X
Out[26]:
array([[ 6. , 148. , 72. , ..., 33.6 , 0.627, 50. ],
[ 1. , 85. , 66. , ..., 26.6 , 0.351, 31. ],
[ 8. , 183. , 64. , ..., 23.3 , 0.672, 32. ],
...,
[ 5. , 121. , 72. , ..., 26.2 , 0.245, 30. ],
[ 1. , 126. , 60. , ..., 30.1 , 0.349, 47. ],
[ 1. , 93. , 70. , ..., 30.4 , 0.315, 23. ]])
In [27]:
Y
Out[27]:
array([[1],
       [0],
       [1],
       ...,
       [0],
       [1],
       [0]], dtype=int64)
In [28]:
# Defining the test split ratio
split_test_size = 0.30
In [29]:
# Creating the training and test sets
X_treino, X_teste, Y_treino, Y_teste = train_test_split(X, Y, test_size = split_test_size, random_state = 42)
In [30]:
# Printing the resulting split proportions
print("{0:0.2f}% in the training data".format((len(X_treino)/len(df.index)) * 100))
print("{0:0.2f}% in the test data".format((len(X_teste)/len(df.index)) * 100))
69.92% in the training data
30.08% in the test data
In [31]:
X_treino
Out[31]:
array([[ 1. , 95. , 60. , ..., 23.9 , 0.26 , 22. ],
[ 5. , 105. , 72. , ..., 36.9 , 0.159, 28. ],
[ 0. , 135. , 68. , ..., 42.3 , 0.365, 24. ],
...,
[ 10. , 101. , 86. , ..., 45.6 , 1.136, 38. ],
[ 0. , 141. , 0. , ..., 42.4 , 0.205, 29. ],
[ 0. , 125. , 96. , ..., 22.5 , 0.262, 21. ]])
In [32]:
X_teste
Out[32]:
array([[6.00e+00, 9.80e+01, 5.80e+01, ..., 3.40e+01, 4.30e-01, 4.30e+01],
[2.00e+00, 1.12e+02, 7.50e+01, ..., 3.57e+01, 1.48e-01, 2.10e+01],
[2.00e+00, 1.08e+02, 6.40e+01, ..., 3.08e+01, 1.58e-01, 2.10e+01],
...,
[0.00e+00, 1.27e+02, 8.00e+01, ..., 3.63e+01, 8.04e-01, 2.30e+01],
[6.00e+00, 1.05e+02, 7.00e+01, ..., 3.08e+01, 1.22e-01, 3.70e+01],
[5.00e+00, 7.70e+01, 8.20e+01, ..., 3.58e+01, 1.56e-01, 3.50e+01]])
In [33]:
Y_treino
Out[33]:
array([[0],
       [0],
       [1],
       ...,
       [1],
       [1],
       [0]], dtype=int64)
In [34]:
Y_teste
Out[34]:
array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)
Checking the split¶
In [35]:
print("Original True : {0} ({1:0.2f}%)".format(len(df.loc[df['diabetes'] == 1]),
(len(df.loc[df['diabetes'] ==1])/len(df.index) * 100)))
print("Original False : {0} ({1:0.2f}%)".format(len(df.loc[df['diabetes'] == 0]),
(len(df.loc[df['diabetes'] == 0])/len(df.index) * 100)))
print("")
print("Training True : {0} ({1:0.2f}%)".format(len(Y_treino[Y_treino[:] == 1]),
(len(Y_treino[Y_treino[:] == 1])/len(Y_treino) * 100)))
print("Training False : {0} ({1:0.2f}%)".format(len(Y_treino[Y_treino[:] == 0]),
(len(Y_treino[Y_treino[:] == 0])/len(Y_treino) * 100)))
print("")
print("Test True : {0} ({1:0.2f}%)".format(len(Y_teste[Y_teste[:] == 1]),
(len(Y_teste[Y_teste[:] == 1])/len(Y_teste) * 100)))
print("Test False : {0} ({1:0.2f}%)".format(len(Y_teste[Y_teste[:] == 0]),
(len(Y_teste[Y_teste[:] == 0])/len(Y_teste) * 100)))
Original True  : 268 (34.90%)
Original False : 500 (65.10%)

Training True  : 188 (35.01%)
Training False : 349 (64.99%)

Test True      : 80 (34.63%)
Test False     : 151 (65.37%)
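The proportions are close but not identical, because the split is a plain random sample. Passing stratify (a sketch, reusing the variables above) makes the class ratio match exactly in both subsets:

from sklearn.model_selection import train_test_split

# Stratified variant: preserves the 1/0 class ratio in training and test sets
X_treino, X_teste, Y_treino, Y_teste = train_test_split(
    X, Y, test_size=split_test_size, random_state=42, stratify=Y)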
Missing or hidden values¶
In [36]:
df.head(5)
Out[36]:
| | num_preg | glucose_conc | diastolic_bp | thickness | insulin | bmi | diab_pred | age | skin | diabetes |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1.3780 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 1.1426 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 0.0000 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0.9062 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1.3790 | 1 |
In [37]:
print("# Linhas no dataframe {0}".format(len(df)))
print("# Linhas missing glucose_conc: {0}".format(len(df.loc[df['glucose_conc'] == 0])))
print("# Linhas missing diastolic_bp: {0}".format(len(df.loc[df['diastolic_bp'] == 0])))
print("# Linhas missing thickness: {0}".format(len(df.loc[df['thickness'] == 0])))
print("# Linhas missing insulin: {0}".format(len(df.loc[df['insulin'] == 0])))
print("# Linhas missing bmi: {0}".format(len(df.loc[df['bmi'] == 0])))
print("# Linhas missing age: {0}".format(len(df.loc[df['age'] == 0])))
# Rows in the dataframe: 768
# Rows missing glucose_conc: 5
# Rows missing diastolic_bp: 35
# Rows missing thickness: 227
# Rows missing insulin: 374
# Rows missing bmi: 11
# Rows missing age: 0
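The same counts can be produced in a single pass (a sketch, assuming df as above):

# Count the zeros in every suspect column at once
cols = ['glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi', 'age']
print((df[cols] == 0).sum())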
Handling Missing Data - Impute¶
In [45]:
# fixing the hidden missing values in the dataset
from sklearn.impute import SimpleImputer
In [46]:
# Creating the imputer object
preenche_0 = SimpleImputer(missing_values = 0, strategy = "mean")
# Replacing values equal to zero with the mean of the data
X_treino = preenche_0.fit_transform(X_treino)
# Note: fit_transform on the test set recomputes the means from the test data;
# strictly, the imputer should be fit on the training data only and the test
# set converted with preenche_0.transform(X_teste)
X_teste = preenche_0.fit_transform(X_teste)
In [47]:
X_treino
Out[47]:
array([[ 1. , 95. , 60. , ..., 23.9 ,
0.26 , 22. ],
[ 5. , 105. , 72. , ..., 36.9 ,
0.159 , 28. ],
[ 4.34056399, 135. , 68. , ..., 42.3 ,
0.365 , 24. ],
...,
[ 10. , 101. , 86. , ..., 45.6 ,
1.136 , 38. ],
[ 4.34056399, 141. , 72.24131274, ..., 42.4 ,
0.205 , 29. ],
[ 4.34056399, 125. , 96. , ..., 22.5 ,
0.262 , 21. ]])
In [48]:
X_teste
Out[48]:
array([[6.00000000e+00, 9.80000000e+01, 5.80000000e+01, ...,
3.40000000e+01, 4.30000000e-01, 4.30000000e+01],
[2.00000000e+00, 1.12000000e+02, 7.50000000e+01, ...,
3.57000000e+01, 1.48000000e-01, 2.10000000e+01],
[2.00000000e+00, 1.08000000e+02, 6.40000000e+01, ...,
3.08000000e+01, 1.58000000e-01, 2.10000000e+01],
...,
[4.85714286e+00, 1.27000000e+02, 8.00000000e+01, ...,
3.63000000e+01, 8.04000000e-01, 2.30000000e+01],
[6.00000000e+00, 1.05000000e+02, 7.00000000e+01, ...,
3.08000000e+01, 1.22000000e-01, 3.70000000e+01],
[5.00000000e+00, 7.70000000e+01, 8.20000000e+01, ...,
3.58000000e+01, 1.56000000e-01, 3.50000000e+01]])
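A common way to avoid the leakage flagged above is to chain the imputer and the model in a scikit-learn Pipeline, which fits the imputer on the training data only and reuses those means on the test set. A minimal sketch, assuming it starts from the raw split (before the fit_transform cells above):

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB

# The pipeline learns the imputation means during fit() and reuses them at scoring time
pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=0, strategy='mean')),
    ('model', GaussianNB()),
])
pipe.fit(X_treino, Y_treino.ravel())
print("Test accuracy: {0:.4f}".format(pipe.score(X_teste, Y_teste.ravel())))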
Building and training the ML model¶
In [56]:
# Using a Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
In [57]:
# Creating the predictive model
modelo_v1 = GaussianNB()
In [58]:
# Training the model
modelo_v1.fit(X_treino, Y_treino.ravel())
Out[58]:
GaussianNB()
Is the model accurate on the training data?¶
In [59]:
from sklearn import metrics
In [60]:
nb_predict_train = modelo_v1.predict(X_treino) # X holds the input (predictor) variables
In [61]:
# assessing the model's accuracy on the training data
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(Y_treino, nb_predict_train)))
print()
Accuracy: 0.7542
In [62]:
# assessing the model on the test data
nb_predict_test = modelo_v1.predict(X_teste)
In [63]:
print("Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_teste, nb_predict_test)))
print()
Accuracy: 0.7359
The test data are new, unseen data, so this accuracy is a more realistic estimate of how the model will perform.
Metrics¶
In [64]:
from IPython.display import Image
Image('ConfusionMatrix.jpg')
Out[64]:
In [65]:
# a matrix for assessing the model's error rates
# Creating a confusion matrix
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(Y_teste, nb_predict_test, labels = [1, 0])))
print("")
print("Classification Report")
print(metrics.classification_report(Y_teste, nb_predict_test, labels = [1, 0]))
Confusion Matrix
[[ 52 28]
[ 33 118]]
Classification Report
precision recall f1-score support
1 0.61 0.65 0.63 80
0 0.81 0.78 0.79 151
accuracy 0.74 231
macro avg 0.71 0.72 0.71 231
weighted avg 0.74 0.74 0.74 231
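Reading the report from the matrix: with labels=[1, 0], the first row holds the true class-1 cases, so TP = 52, FN = 28, FP = 33 and TN = 118. Precision for class 1 is TP / (TP + FP) = 52 / 85 ≈ 0.61, recall is TP / (TP + FN) = 52 / 80 = 0.65, and the overall accuracy is (52 + 118) / 231 ≈ 0.74, all matching the report above.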
Optimizing the model with Random Forest: an ensemble of decision trees¶
In [66]:
from sklearn.ensemble import RandomForestClassifier
In [68]:
modelo_v2 = RandomForestClassifier(random_state = 42)
modelo_v2.fit(X_treino, Y_treino.ravel())
Out[68]:
RandomForestClassifier(random_state=42)
In [69]:
# Evaluating on the training data
rf_predict_train = modelo_v2.predict(X_treino)
print("Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_treino, rf_predict_train)))
Accuracy: 1.0000
In [70]:
# Evaluating on the test data
rf_predict_test = modelo_v2.predict(X_teste)
print("Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_teste, rf_predict_test)))
print()
Accuracy: 0.7403
In [71]:
print("Confusion Matrix")
print("{0}".format(metrics.confusion_matrix(Y_teste, rf_predict_test, labels = [1, 0])))
print("")
print("Classification Report")
print(metrics.classification_report(Y_teste, rf_predict_test, labels = [1, 0]))
Confusion Matrix
[[ 52 28]
[ 32 119]]
Classification Report
precision recall f1-score support
1 0.62 0.65 0.63 80
0 0.81 0.79 0.80 151
accuracy 0.74 231
macro avg 0.71 0.72 0.72 231
weighted avg 0.74 0.74 0.74 231
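The perfect score on the training data against 0.74 on the test data shows the forest memorizes the training set. A fitted Random Forest also exposes per-feature importance scores, which hint at which variables drive the predictions; a sketch, assuming atributos and modelo_v2 from the cells above:

# Rank the predictors by their importance in the fitted forest
for nome, imp in sorted(zip(atributos, modelo_v2.feature_importances_),
                        key=lambda par: par[1], reverse=True):
    print("{0:<14s} {1:.3f}".format(nome, imp))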
Logistic regression: a classification algorithm¶
In [72]:
from sklearn.linear_model import LogisticRegression
In [73]:
# Third version of the model, using Logistic Regression
modelo_v3 = LogisticRegression(C = 0.7, random_state = 42, max_iter = 1000)
modelo_v3.fit(X_treino, Y_treino.ravel())
lr_predict_test = modelo_v3.predict(X_teste)
In [74]:
print("Exatidão (Accuracy): {0:.4f}".format(metrics.accuracy_score(Y_teste, lr_predict_test)))
print()
print("Classification Report")
print(metrics.classification_report(Y_teste, lr_predict_test, labels = [1, 0]))
Accuracy: 0.7359
Classification Report
precision recall f1-score support
1 0.63 0.59 0.61 80
0 0.79 0.81 0.80 151
accuracy 0.74 231
macro avg 0.71 0.70 0.70 231
weighted avg 0.73 0.74 0.73 231
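The C parameter is the inverse of the regularization strength (smaller C means a stronger penalty), and the 0.7 above is just one choice. A sketch for comparing a few values, reusing the split from earlier:

from sklearn.linear_model import LogisticRegression

# Try a few regularization strengths and compare test accuracy
for C in (0.1, 0.7, 1.0, 10.0):
    m = LogisticRegression(C=C, random_state=42, max_iter=1000)
    m.fit(X_treino, Y_treino.ravel())
    print("C = {0:<4} accuracy = {1:.4f}".format(C, m.score(X_teste, Y_teste.ravel())))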
Making Predictions with the Trained Model¶
In [76]:
import pickle
In [77]:
# Saving the model for later use
filename = 'modelo_treinado_v3.sav'
pickle.dump(modelo_v3, open(filename, 'wb'))
In [78]:
X_teste
Out[78]:
array([[6.00000000e+00, 9.80000000e+01, 5.80000000e+01, ...,
3.40000000e+01, 4.30000000e-01, 4.30000000e+01],
[2.00000000e+00, 1.12000000e+02, 7.50000000e+01, ...,
3.57000000e+01, 1.48000000e-01, 2.10000000e+01],
[2.00000000e+00, 1.08000000e+02, 6.40000000e+01, ...,
3.08000000e+01, 1.58000000e-01, 2.10000000e+01],
...,
[4.85714286e+00, 1.27000000e+02, 8.00000000e+01, ...,
3.63000000e+01, 8.04000000e-01, 2.30000000e+01],
[6.00000000e+00, 1.05000000e+02, 7.00000000e+01, ...,
3.08000000e+01, 1.22000000e-01, 3.70000000e+01],
[5.00000000e+00, 7.70000000e+01, 8.20000000e+01, ...,
3.58000000e+01, 1.56000000e-01, 3.50000000e+01]])
In [83]:
# Loading the model and making predictions on new data
# (X_teste and Y_teste stand in for new data prepared with the same cleaning and transformation steps)
loaded_model = pickle.load(open(filename, 'rb'))
resultado1 = loaded_model.predict(X_teste[15].reshape(1, -1))
resultado2 = loaded_model.predict(X_teste[18].reshape(1, -1))
print(resultado1)
print(resultado2)
[0]
[1]
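Beyond the hard 0/1 label, the model can also report class probabilities, which is often more informative in a clinical setting; a sketch using the same loaded model:

# predict_proba returns [P(class 0), P(class 1)] for each row
probs = loaded_model.predict_proba(X_teste[15].reshape(1, -1))
print(probs)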