ScikitLearn ML models got cv_results.mean() =0 and cv

ScikitLearn ML models got cv_results.mean() =0 and cv_results.std() = 0

up vote
0
down vote

favorite

I have a dataset (https://github.com/ivonnics/Machine-Learning/blob/master/CJD2.csv) from my cellular data usage with three fields: Date, Time and Volume.
From the Date feature I separated the different days of the week (mon - sun) and from the Time feature I considered four (4) different time frames (Midnight, Morning, Afternoon and Evening). With those 11 "New" features, I am trying to find a relation between weekday, time frame and the data Volume used.
I modified a Jason Brownlee (@TeachTheMachine) program (you can download the modified version from my github at https://github.com/ivonnics/Machine-Learning/blob/master/Data%20Analytical%20Github.py) and got the same result for all the different models used: mean and standard deviation equal to zero (0). I don't understand why...
Any help or recommendation?
The program:

# -*- coding: utf-8 -*-

"""

Created on Sat Nov 10 15:18:54 2018

@author: ivonnics

"""



import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn import model_selection

from sklearn import preprocessing

from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

from pandas.plotting import scatter_matrix



# Load the usage log. The URL points at the GitHub "blob" page, so the data is
# taken from an HTML table on that page (read_html) instead of from a raw CSV.
url = "https://github.com/ivonnics/Machine-Learning/blob/master/CJD2.csv"

# read_html returns a list of DataFrames, one per table found; the first is used.
Tabla = pd.read_html(url)[0]
dataset = Tabla[['Date', 'Time', 'Volume']]

print('-----------------------------------------------------------')

# Hour of day (0-23) of every record. The 12-hour clock strings are parsed in a
# single vectorized call instead of one to_datetime call per row; values that
# do not match the format are coerced to NaT and yield a missing hour.
dataset2 = pd.to_datetime(dataset['Time'], format="%I:%M:%S %p", errors="coerce").dt.hour
dataset4 = dataset2.to_frame(name='Hour')

#print(dataset4.head(20))



RULE = '-----------------------------------------------------------'

# Quick look at the raw table: first rows, dimensions, summary statistics and
# the number of distinct values per column, each followed by a rule.
for summary in (dataset.head(20), dataset.shape, dataset.describe(), dataset.nunique()):
    print(summary)
    print(RULE)
print(RULE)

# Put the derived Hour column next to the original Date/Time/Volume columns.
df_new1 = pd.concat([dataset, dataset4], axis=1)

# Spot check: show the records whose hour is 5.
print(RULE)
hour_is_five = df_new1['Hour'] == 5
print(df_new1[hour_is_five])
print(RULE)



# Weekday name of every record. Dates that do not match month/day/year are
# coerced to NaT and therefore produce a missing weekday.
dataset5 = pd.to_datetime(dataset['Date'], format="%m/%d/%Y", errors="coerce")

# Series.dt.weekday_name was deprecated in pandas 0.23 and removed in 1.0;
# day_name() returns the same English names ('Monday' ... 'Sunday').
dataset6 = dataset5.dt.day_name()
dataset8 = dataset6.to_frame(name='Weekday')

# Append the Weekday column to the frame that already carries Hour.
df_new2 = pd.concat([df_new1, dataset8], axis=1)



# One 0/1 indicator per six-hour time slot; a slot covers start <= Hour < end.
time_slots = (
    ('Madrugada', 0, 6),
    ('Mañana', 6, 12),
    ('Tarde', 12, 18),
    ('Noche', 18, 24),
)
for slot_name, start, end in time_slots:
    in_slot = (df_new2['Hour'] >= start) & (df_new2['Hour'] < end)
    df_new2[slot_name] = np.where(in_slot, 1, 0)

# One 0/1 indicator per weekday, matched on the English name held in 'Weekday'.
day_columns = (
    ('Lunes', 'Monday'),
    ('Martes', 'Tuesday'),
    ('Miércoles', 'Wednesday'),
    ('Jueves', 'Thursday'),
    ('Viernes', 'Friday'),
    ('Sábado', 'Saturday'),
    ('Domingo', 'Sunday'),
)
for day_column, english_name in day_columns:
    df_new2[day_column] = np.where(df_new2['Weekday'] == english_name, 1, 0)

print(df_new2.shape)
print(df_new2.head(20))

# Modelling frame: the seven weekday flags, the four time-slot flags, then Volume.
indicator_columns = [day for day, _ in day_columns] + [slot for slot, _, _ in time_slots]
df_new3 = df_new2[indicator_columns + ['Volume']]



# Analysis: dimensions and first rows of the modelling frame, then the summary
# statistics of the raw table.
print(df_new3.shape)
print(df_new3.head(20))
print(dataset.describe())

# Number of records per weekday, per time-slot flag and per distinct volume.
print(df_new2.groupby('Weekday').size())
for group_key in ('Madrugada', 'Mañana', 'Tarde', 'Noche', 'Volume'):
    print(df_new3.groupby(group_key).size())

# Box-and-whisker plot of every column, each on its own axes in a 4x3 grid.
df_new3.plot(kind='box', subplots=True, layout=(4,3), sharex=False, sharey=False)
plt.show()

# Histogram of every column.
df_new3.hist()
plt.show()

# Scatter plot matrix of all column pairs.
scatter_matrix(df_new3)
plt.show()



# Split-out validation dataset

# Turn the modelling frame into a plain array: columns 0-10 are the eleven
# weekday / time-slot indicators, column 11 is Volume (see the df_new3 column list).
array = df_new3.values

X = array[:,0:11]

#print(X)

Y = array[:,11]





#print(Y)

# LabelEncoder replaces every distinct Volume value with an integer class id,
# which is what lets the classifiers below accept Volume as a target.
# NOTE(review): Volume looks like a continuous quantity; if nearly every sample
# has its own value, this yields roughly one class per sample and accuracy-based
# cross-validation will score close to zero. Consider binning Volume or using
# regression models instead - confirm against the data.
lab_enc = preprocessing.LabelEncoder()

encoded = lab_enc.fit_transform(Y)

Y=encoded

#print(Y)

print('')



# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
validation_size = 0.20

seed = 7

X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)





# Cross-validation settings.
num_folds = 10
num_instances = len(X_train)
seed = 7
scoring = 'accuracy'

# Candidate classifiers, all with default hyper-parameters, as (label, estimator).
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Evaluate each model in turn with k-fold cross-validation on the training split.
results = []
names = []

# The folds are taken in row order (no shuffling). random_state is deliberately
# not passed: it only has an effect together with shuffle=True, and recent
# scikit-learn releases raise ValueError when it is set without shuffling.
kfold = model_selection.KFold(n_splits=num_folds)

for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report the mean and standard deviation of the per-fold accuracies.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)





# Compare Algorithms: one box per model showing the spread of its fold scores.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
# Label each box with the short model name collected during evaluation.
ax.set_xticklabels(names)
plt.show()

This is the result I got after the models' evaluation:

LR: 0.000000 (0.000000)

KNN: 0.000000 (0.000000)

CART: 0.000000 (0.000000)

NB: 0.000000 (0.000000)

SVM: 0.000000 (0.000000)

Thanking you in advance for any help...
José

edited Nov 20 at 10:53

desertnaut

15.6k53463

asked Nov 19 at 20:04

José Luis Martínez

258

Are you sure Y = array[:,11] points at the correct labels?
– Geeocode
Nov 19 at 20:20

I believe so, I printed the array with print(Y) and got the desired result, feel free to download the program and try by yourself...
– José Luis Martínez
Nov 19 at 20:34

Welcome to SO; please see How to create a Minimal, Complete, and Verifiable example, as well as why a wall of code isn't helpful.
– desertnaut
Nov 19 at 20:45

Is Volume a continuous or categorical variable?
– vealkind
Nov 19 at 20:50

Originally it is a continuous variable, but after: 'lab_enc = preprocessing.LabelEncoder() encoded = lab_enc.fit_transform(Y) Y=encoded'. I believe it is transformed into a categorical one...
– José Luis Martínez
Nov 19 at 22:06

add a comment |

up vote
0
down vote

favorite

# -*- coding: utf-8 -*-

"""

Created on Sat Nov 10 15:18:54 2018

@author: ivonnics

"""



import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn import model_selection

from sklearn import preprocessing

from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

from pandas.plotting import scatter_matrix



url = "https://github.com/ivonnics/Machine-Learning/blob/master/CJD2.csv"

dataset = pd.read_html(url)

Tabla=dataset[0]

dataset=Tabla[['Date', 'Time', 'Volume']]



dataset1=[pd.to_datetime(hour, format="%I:%M:%S %p", errors="coerce") for hour in dataset['Time']]



print('-----------------------------------------------------------')

#print('TESTANDO')

dataset2=pd.Series(dataset1).dt.hour

#print(dataset2)

dataset3={'Hour': dataset2}

#print(dataset3)

dataset4=pd.DataFrame(dataset3, columns = ['Hour'])

#print(dataset4.head(20))



print(dataset.head(20))

print('-----------------------------------------------------------')

print(dataset.shape)

print('-----------------------------------------------------------')

print(dataset.describe())

print('-----------------------------------------------------------')





print(dataset.nunique())

print('-----------------------------------------------------------')



print('-----------------------------------------------------------')





df_new1= pd.concat([dataset, dataset4], axis=1)



print('-----------------------------------------------------------')

print(df_new1[(df_new1['Hour'] == 5)])

print('-----------------------------------------------------------')



# Weekday name of every record. Dates that do not match month/day/year are
# coerced to NaT and therefore produce a missing weekday.
dataset5 = pd.to_datetime(dataset['Date'], format="%m/%d/%Y", errors="coerce")

# Series.dt.weekday_name was deprecated in pandas 0.23 and removed in 1.0;
# day_name() returns the same English names ('Monday' ... 'Sunday').
dataset6 = dataset5.dt.day_name()
dataset8 = dataset6.to_frame(name='Weekday')

# Append the Weekday column to the frame that already carries Hour.
df_new2 = pd.concat([df_new1, dataset8], axis=1)



df_new2['Madrugada'] = np.where((df_new2['Hour']>=0) & (df_new2['Hour']<6), 1, 0)

df_new2['Mañana'] = np.where((df_new2['Hour']>=6) & (df_new2['Hour']<12), 1, 0)

df_new2['Tarde'] = np.where((df_new2['Hour']>=12) & (df_new2['Hour']<18), 1, 0)

df_new2['Noche'] = np.where((df_new2['Hour']>=18) & (df_new2['Hour']<24), 1, 0)

df_new2['Lunes'] = np.where((df_new2['Weekday']=='Monday'), 1, 0)

df_new2['Martes'] = np.where((df_new2['Weekday']=='Tuesday'), 1, 0)

df_new2['Miércoles'] = np.where((df_new2['Weekday']=='Wednesday'), 1, 0)

df_new2['Jueves'] = np.where((df_new2['Weekday']=='Thursday'), 1, 0)

df_new2['Viernes'] = np.where((df_new2['Weekday']=='Friday'), 1, 0)

df_new2['Sábado'] = np.where((df_new2['Weekday']=='Saturday'), 1, 0)

df_new2['Domingo'] = np.where((df_new2['Weekday']=='Sunday'), 1, 0)





print(df_new2.shape)

print(df_new2.head(20))





df_new3=df_new2[['Lunes', 'Martes', 'Miércoles', 'Jueves', 'Viernes', 'Sábado', 'Domingo', 'Madrugada', 'Mañana', 'Tarde', 'Noche', 'Volume']]



#Analysis

print(df_new3.shape)

print(df_new3.head(20))

print(dataset.describe())

print(df_new2.groupby('Weekday').size())

print(df_new3.groupby('Madrugada').size())

print(df_new3.groupby('Mañana').size())

print(df_new3.groupby('Tarde').size())

print(df_new3.groupby('Noche').size())

print(df_new3.groupby('Volume').size())

# box and whisker plots

df_new3.plot(kind='box', subplots=True, layout=(4,3), sharex=False, sharey=False)

plt.show()

# histograms

df_new3.hist()

plt.show()

# scatter plot matrix

scatter_matrix(df_new3)

plt.show()



# Split-out validation dataset

array = df_new3.values

X = array[:,0:11]

#print(X)

Y = array[:,11]





#print(Y)

lab_enc = preprocessing.LabelEncoder()

encoded = lab_enc.fit_transform(Y)

Y=encoded

#print(Y)

print('')



validation_size = 0.20

seed = 7

X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)





# Cross-validation settings.
num_folds = 10
num_instances = len(X_train)
seed = 7
scoring = 'accuracy'

# Candidate classifiers, all with default hyper-parameters, as (label, estimator).
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Evaluate each model in turn with k-fold cross-validation on the training split.
results = []
names = []

# The folds are taken in row order (no shuffling). random_state is deliberately
# not passed: it only has an effect together with shuffle=True, and recent
# scikit-learn releases raise ValueError when it is set without shuffling.
kfold = model_selection.KFold(n_splits=num_folds)

for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report the mean and standard deviation of the per-fold accuracies.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)





# Compare Algorithms

fig = plt.figure()

fig.suptitle('Algorithm Comparison')

ax = fig.add_subplot(111)

plt.boxplot(results)

ax.set_xticklabels(names)

plt.show()

This is the result I got after the model`s evaluation:

LR: 0.000000 (0.000000)

KNN: 0.000000 (0.000000)

CART: 0.000000 (0.000000)

NB: 0.000000 (0.000000)

SVM: 0.000000 (0.000000)

Thanking you in advance for any help...
José

edited Nov 20 at 10:53

desertnaut

15.6k53463

asked Nov 19 at 20:04

José Luis Martínez

258

Are you sure Y = array[:,11] is points at the correct labels?
– Geeocode
Nov 19 at 20:20

I believe so, I printed the array with print(Y) and got the desired result, feel free to download the program and try by yourself...
– José Luis Martínez
Nov 19 at 20:34

Welcome to SO; please see How to create a Minimal, Complete, and Verifiable example, as well as why a wall of code isn't helpful.
– desertnaut
Nov 19 at 20:45

Is Volume a continuous or categorical variable?
– vealkind
Nov 19 at 20:50

Originally it is a continuos variable, but after: 'lab_enc = preprocessing.LabelEncoder() encoded = lab_enc.fit_transform(Y) Y=encoded'. I believe it is transformed into a categorical one...
– José Luis Martínez
Nov 19 at 22:06

add a comment |

up vote
0
down vote

favorite

# -*- coding: utf-8 -*-

"""

Created on Sat Nov 10 15:18:54 2018

@author: ivonnics

"""



import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn import model_selection

from sklearn import preprocessing

from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

from pandas.plotting import scatter_matrix



url = "https://github.com/ivonnics/Machine-Learning/blob/master/CJD2.csv"

dataset = pd.read_html(url)

Tabla=dataset[0]

dataset=Tabla[['Date', 'Time', 'Volume']]



dataset1=[pd.to_datetime(hour, format="%I:%M:%S %p", errors="coerce") for hour in dataset['Time']]



print('-----------------------------------------------------------')

#print('TESTANDO')

dataset2=pd.Series(dataset1).dt.hour

#print(dataset2)

dataset3={'Hour': dataset2}

#print(dataset3)

dataset4=pd.DataFrame(dataset3, columns = ['Hour'])

#print(dataset4.head(20))



print(dataset.head(20))

print('-----------------------------------------------------------')

print(dataset.shape)

print('-----------------------------------------------------------')

print(dataset.describe())

print('-----------------------------------------------------------')





print(dataset.nunique())

print('-----------------------------------------------------------')



print('-----------------------------------------------------------')





df_new1= pd.concat([dataset, dataset4], axis=1)



print('-----------------------------------------------------------')

print(df_new1[(df_new1['Hour'] == 5)])

print('-----------------------------------------------------------')



# Weekday name of every record. Dates that do not match month/day/year are
# coerced to NaT and therefore produce a missing weekday.
dataset5 = pd.to_datetime(dataset['Date'], format="%m/%d/%Y", errors="coerce")

# Series.dt.weekday_name was deprecated in pandas 0.23 and removed in 1.0;
# day_name() returns the same English names ('Monday' ... 'Sunday').
dataset6 = dataset5.dt.day_name()
dataset8 = dataset6.to_frame(name='Weekday')

# Append the Weekday column to the frame that already carries Hour.
df_new2 = pd.concat([df_new1, dataset8], axis=1)



df_new2['Madrugada'] = np.where((df_new2['Hour']>=0) & (df_new2['Hour']<6), 1, 0)

df_new2['Mañana'] = np.where((df_new2['Hour']>=6) & (df_new2['Hour']<12), 1, 0)

df_new2['Tarde'] = np.where((df_new2['Hour']>=12) & (df_new2['Hour']<18), 1, 0)

df_new2['Noche'] = np.where((df_new2['Hour']>=18) & (df_new2['Hour']<24), 1, 0)

df_new2['Lunes'] = np.where((df_new2['Weekday']=='Monday'), 1, 0)

df_new2['Martes'] = np.where((df_new2['Weekday']=='Tuesday'), 1, 0)

df_new2['Miércoles'] = np.where((df_new2['Weekday']=='Wednesday'), 1, 0)

df_new2['Jueves'] = np.where((df_new2['Weekday']=='Thursday'), 1, 0)

df_new2['Viernes'] = np.where((df_new2['Weekday']=='Friday'), 1, 0)

df_new2['Sábado'] = np.where((df_new2['Weekday']=='Saturday'), 1, 0)

df_new2['Domingo'] = np.where((df_new2['Weekday']=='Sunday'), 1, 0)





print(df_new2.shape)

print(df_new2.head(20))





df_new3=df_new2[['Lunes', 'Martes', 'Miércoles', 'Jueves', 'Viernes', 'Sábado', 'Domingo', 'Madrugada', 'Mañana', 'Tarde', 'Noche', 'Volume']]



#Analysis

print(df_new3.shape)

print(df_new3.head(20))

print(dataset.describe())

print(df_new2.groupby('Weekday').size())

print(df_new3.groupby('Madrugada').size())

print(df_new3.groupby('Mañana').size())

print(df_new3.groupby('Tarde').size())

print(df_new3.groupby('Noche').size())

print(df_new3.groupby('Volume').size())

# box and whisker plots

df_new3.plot(kind='box', subplots=True, layout=(4,3), sharex=False, sharey=False)

plt.show()

# histograms

df_new3.hist()

plt.show()

# scatter plot matrix

scatter_matrix(df_new3)

plt.show()



# Split-out validation dataset

array = df_new3.values

X = array[:,0:11]

#print(X)

Y = array[:,11]





#print(Y)

lab_enc = preprocessing.LabelEncoder()

encoded = lab_enc.fit_transform(Y)

Y=encoded

#print(Y)

print('')



validation_size = 0.20

seed = 7

X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)





# Cross-validation settings.
num_folds = 10
num_instances = len(X_train)
seed = 7
scoring = 'accuracy'

# Candidate classifiers, all with default hyper-parameters, as (label, estimator).
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Evaluate each model in turn with k-fold cross-validation on the training split.
results = []
names = []

# The folds are taken in row order (no shuffling). random_state is deliberately
# not passed: it only has an effect together with shuffle=True, and recent
# scikit-learn releases raise ValueError when it is set without shuffling.
kfold = model_selection.KFold(n_splits=num_folds)

for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report the mean and standard deviation of the per-fold accuracies.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)





# Compare Algorithms

fig = plt.figure()

fig.suptitle('Algorithm Comparison')

ax = fig.add_subplot(111)

plt.boxplot(results)

ax.set_xticklabels(names)

plt.show()

This is the result I got after the model`s evaluation:

LR: 0.000000 (0.000000)

KNN: 0.000000 (0.000000)

CART: 0.000000 (0.000000)

NB: 0.000000 (0.000000)

SVM: 0.000000 (0.000000)

Thanking you in advance for any help...
José

edited Nov 20 at 10:53

desertnaut

15.6k53463

asked Nov 19 at 20:04

José Luis Martínez

258

# -*- coding: utf-8 -*-

"""

Created on Sat Nov 10 15:18:54 2018

@author: ivonnics

"""



import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

from sklearn import model_selection

from sklearn import preprocessing

from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier

from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

from pandas.plotting import scatter_matrix



url = "https://github.com/ivonnics/Machine-Learning/blob/master/CJD2.csv"

dataset = pd.read_html(url)

Tabla=dataset[0]

dataset=Tabla[['Date', 'Time', 'Volume']]



dataset1=[pd.to_datetime(hour, format="%I:%M:%S %p", errors="coerce") for hour in dataset['Time']]



print('-----------------------------------------------------------')

#print('TESTANDO')

dataset2=pd.Series(dataset1).dt.hour

#print(dataset2)

dataset3={'Hour': dataset2}

#print(dataset3)

dataset4=pd.DataFrame(dataset3, columns = ['Hour'])

#print(dataset4.head(20))



print(dataset.head(20))

print('-----------------------------------------------------------')

print(dataset.shape)

print('-----------------------------------------------------------')

print(dataset.describe())

print('-----------------------------------------------------------')





print(dataset.nunique())

print('-----------------------------------------------------------')



print('-----------------------------------------------------------')





df_new1= pd.concat([dataset, dataset4], axis=1)



print('-----------------------------------------------------------')

print(df_new1[(df_new1['Hour'] == 5)])

print('-----------------------------------------------------------')



# Weekday name of every record. Dates that do not match month/day/year are
# coerced to NaT and therefore produce a missing weekday.
dataset5 = pd.to_datetime(dataset['Date'], format="%m/%d/%Y", errors="coerce")

# Series.dt.weekday_name was deprecated in pandas 0.23 and removed in 1.0;
# day_name() returns the same English names ('Monday' ... 'Sunday').
dataset6 = dataset5.dt.day_name()
dataset8 = dataset6.to_frame(name='Weekday')

# Append the Weekday column to the frame that already carries Hour.
df_new2 = pd.concat([df_new1, dataset8], axis=1)



df_new2['Madrugada'] = np.where((df_new2['Hour']>=0) & (df_new2['Hour']<6), 1, 0)

df_new2['Mañana'] = np.where((df_new2['Hour']>=6) & (df_new2['Hour']<12), 1, 0)

df_new2['Tarde'] = np.where((df_new2['Hour']>=12) & (df_new2['Hour']<18), 1, 0)

df_new2['Noche'] = np.where((df_new2['Hour']>=18) & (df_new2['Hour']<24), 1, 0)

df_new2['Lunes'] = np.where((df_new2['Weekday']=='Monday'), 1, 0)

df_new2['Martes'] = np.where((df_new2['Weekday']=='Tuesday'), 1, 0)

df_new2['Miércoles'] = np.where((df_new2['Weekday']=='Wednesday'), 1, 0)

df_new2['Jueves'] = np.where((df_new2['Weekday']=='Thursday'), 1, 0)

df_new2['Viernes'] = np.where((df_new2['Weekday']=='Friday'), 1, 0)

df_new2['Sábado'] = np.where((df_new2['Weekday']=='Saturday'), 1, 0)

df_new2['Domingo'] = np.where((df_new2['Weekday']=='Sunday'), 1, 0)





print(df_new2.shape)

print(df_new2.head(20))





df_new3=df_new2[['Lunes', 'Martes', 'Miércoles', 'Jueves', 'Viernes', 'Sábado', 'Domingo', 'Madrugada', 'Mañana', 'Tarde', 'Noche', 'Volume']]



#Analysis

print(df_new3.shape)

print(df_new3.head(20))

print(dataset.describe())

print(df_new2.groupby('Weekday').size())

print(df_new3.groupby('Madrugada').size())

print(df_new3.groupby('Mañana').size())

print(df_new3.groupby('Tarde').size())

print(df_new3.groupby('Noche').size())

print(df_new3.groupby('Volume').size())

# box and whisker plots

df_new3.plot(kind='box', subplots=True, layout=(4,3), sharex=False, sharey=False)

plt.show()

# histograms

df_new3.hist()

plt.show()

# scatter plot matrix

scatter_matrix(df_new3)

plt.show()



# Split-out validation dataset

array = df_new3.values

X = array[:,0:11]

#print(X)

Y = array[:,11]





#print(Y)

lab_enc = preprocessing.LabelEncoder()

encoded = lab_enc.fit_transform(Y)

Y=encoded

#print(Y)

print('')



validation_size = 0.20

seed = 7

X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed)





# Cross-validation settings.
num_folds = 10
num_instances = len(X_train)
seed = 7
scoring = 'accuracy'

# Candidate classifiers, all with default hyper-parameters, as (label, estimator).
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

# Evaluate each model in turn with k-fold cross-validation on the training split.
results = []
names = []

# The folds are taken in row order (no shuffling). random_state is deliberately
# not passed: it only has an effect together with shuffle=True, and recent
# scikit-learn releases raise ValueError when it is set without shuffling.
kfold = model_selection.KFold(n_splits=num_folds)

for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Report the mean and standard deviation of the per-fold accuracies.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)





# Compare Algorithms

fig = plt.figure()

fig.suptitle('Algorithm Comparison')

ax = fig.add_subplot(111)

plt.boxplot(results)

ax.set_xticklabels(names)

plt.show()

This is the result I got after the model`s evaluation:

LR: 0.000000 (0.000000)

KNN: 0.000000 (0.000000)

CART: 0.000000 (0.000000)

NB: 0.000000 (0.000000)

SVM: 0.000000 (0.000000)

Thanking you in advance for any help...
José

python machine-learning scikit-learn

edited Nov 20 at 10:53

desertnaut

15.6k53463

asked Nov 19 at 20:04

José Luis Martínez

258

edited Nov 20 at 10:53

desertnaut

15.6k53463

asked Nov 19 at 20:04

José Luis Martínez

258

edited Nov 20 at 10:53

desertnaut

15.6k53463

edited Nov 20 at 10:53

desertnaut

15.6k53463

edited Nov 20 at 10:53

desertnaut

15.6k53463

asked Nov 19 at 20:04

José Luis Martínez

258

asked Nov 19 at 20:04

José Luis Martínez

258

asked Nov 19 at 20:04

José Luis Martínez

258

Are you sure Y = array[:,11] is points at the correct labels?
– Geeocode
Nov 19 at 20:20

I believe so, I printed the array with print(Y) and got the desired result, feel free to download the program and try by yourself...
– José Luis Martínez
Nov 19 at 20:34

Welcome to SO; please see How to create a Minimal, Complete, and Verifiable example, as well as why a wall of code isn't helpful.
– desertnaut
Nov 19 at 20:45

Is Volume a continuous or categorical variable?
– vealkind
Nov 19 at 20:50

Originally it is a continuos variable, but after: 'lab_enc = preprocessing.LabelEncoder() encoded = lab_enc.fit_transform(Y) Y=encoded'. I believe it is transformed into a categorical one...
– José Luis Martínez
Nov 19 at 22:06

add a comment |

Are you sure Y = array[:,11] is points at the correct labels?
– Geeocode
Nov 19 at 20:20

I believe so, I printed the array with print(Y) and got the desired result, feel free to download the program and try by yourself...
– José Luis Martínez
Nov 19 at 20:34

Welcome to SO; please see How to create a Minimal, Complete, and Verifiable example, as well as why a wall of code isn't helpful.
– desertnaut
Nov 19 at 20:45

Is Volume a continuous or categorical variable?
– vealkind
Nov 19 at 20:50

Originally it is a continuos variable, but after: 'lab_enc = preprocessing.LabelEncoder() encoded = lab_enc.fit_transform(Y) Y=encoded'. I believe it is transformed into a categorical one...
– José Luis Martínez
Nov 19 at 22:06

Are you sure Y = array[:,11] is points at the correct labels?
– Geeocode
Nov 19 at 20:20

I believe so, I printed the array with print(Y) and got the desired result, feel free to download the program and try by yourself...
– José Luis Martínez
Nov 19 at 20:34

Welcome to SO; please see How to create a Minimal, Complete, and Verifiable example, as well as why a wall of code isn't helpful.
– desertnaut
Nov 19 at 20:45

Is Volume a continuous or categorical variable?
– vealkind
Nov 19 at 20:50

Originally it is a continuos variable, but after: 'lab_enc = preprocessing.LabelEncoder() encoded = lab_enc.fit_transform(Y) Y=encoded'. I believe it is transformed into a categorical one...
– José Luis Martínez
Nov 19 at 22:06

add a comment |

1 Answer
1

active

oldest

votes

up vote
1
down vote

accepted

The problem in your case is that you have 621 samples with 593 unique labels.
That's why deterministic estimators cannot find any learned value for any of your validation samples after KFold (actually, you can get some minimal accuracy with StratifiedKFold at nfold=2 with KNN and CART, but that is not important now).

print(len(Y))

print(len(np.unique(Y)))

Out:

621

593

Test it with a trick (a funny kind of augmentation, for testing purposes only) applied before train_test_split:

X = 5 * list(X)

Y = 5 * list(Y)

and your results will be much better right away:

LR: 0.015700 (0.000403)

KNN: 0.028583 (0.000403)

CART: 0.018519 (0.001610)

NB: 0.018519 (0.001610)

SVM: 0.010870 (0.000403)

So in your original case, at each validation step the estimator will face a sample and estimate label probabilities (or a label) for it, but the validation (test) label will be different from every label it has learned.
As a result it will return 0.00 accuracy.

To better understand say we have

0100000000 256

0100000000 675

0100000000 912

in your training set, after the data has been split into train and test sets.
The estimator will learn it. Because of the relatively huge number of unique labels, the validation set will contain the following:

0100000000 112

0100000000 745

0100000000 312

Then it tries to estimate the proper labels, to the value:

0100000000

which will be something like this depending on estimator and its option:

{256: 0.333, 675: 0.333, 912: 0.333}

So the validation(test) accuracies then:

0100000000 112 at this label: 0.00

0100000000 745 at this label: 0.00

0100000000 312 at this label: 0.00

I hope it is clear now to everybody.

edited Nov 20 at 5:15

answered Nov 19 at 22:13

Geeocode

2,0461819

Liked your augmentation approach! => More data to dilute the lack of capability from the deterministic estimators to find learned values in the validation samples...
– José Luis Martínez
Nov 20 at 0:22

However, the suggested reason (having 621 samples with 593 unique labels) is not the cause of the problem, as after inserting df_new3=df_new3.drop_duplicates(subset=['Volume']) just before df_new3.plot(kind='box', subplots=True, layout=(4,3), sharex=False, sharey=False) you would get 593 samples with 593 unique labels. Even with that, the results continue to be: LR: 0.000000 (0.000000) KNN: 0.000000 (0.000000) CART: 0.000000 (0.000000) NB: 0.000000 (0.000000) SVM: 0.000000 (0.000000)
– José Luis Martínez
Nov 20 at 1:40

@JoséLuisMartínez Then you didn't understand my explanation of the reason. The last section explains what is actually happening. I have added a slightly more concrete explanation.
– Geeocode
Nov 20 at 4:55

@JoséLuisMartínez I am happy, if I could help you José Luis.
– Geeocode
Nov 21 at 22:19

add a comment |

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

// Once the page is ready, set up tag rendering and the answer editor.
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Configure the answer editor via StackExchange.prepareEditor.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            convertImagesToLinks: true,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: 10,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
                // The HTML snippets keep "<" and ">" as \u003c / \u003e escapes;
                // the backslashes and the escaped inner quotes are restored here
                // so that both values are valid string literals again.
                brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
        });

    }
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53381853%2fscikitlearn-ml-models-got-cv-results-mean-0-and-cv-results-std-0%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

up vote
1
down vote

accepted

print(len(Y))

print(len(np.unique(Y)))

Out:

621

593

Test it and make a trick, actually some funny kind of augmentation for testing purposes before of train_test_split:

X = 5 * list(X)

Y = 5 * list(Y)

and your results will be much better right away:

LR: 0.015700 (0.000403)

KNN: 0.028583 (0.000403)

CART: 0.018519 (0.001610)

NB: 0.018519 (0.001610)

SVM: 0.010870 (0.000403)

To better understand say we have

0100000000 256

0100000000 675

0100000000 912

0100000000 112

0100000000 745

0100000000 312

Then it tries to estimate the proper labels, to the value:

0100000000

which will be something like this depending on estimator and its option:

{256: 0.333, 675: 0.333, 912: 0.333}

So the validation(test) accuracies then:

0100000000 112 at this label: 0.00

0100000000 745 at this label: 0.00

0100000000 312 at this label: 0.00

I hope is it clear now to everybody.

edited Nov 20 at 5:15

answered Nov 19 at 22:13

Geeocode

2,0461819

Liked your augmentation approach! => More data to dilute the lack of capability from the deterministic estimators to find learned values in the validation samples...
– José Luis Martínez
Nov 20 at 0:22

However, the suggested reason (having 621 samples with 593 unique labels) is not the cause of the problem, as after inserting df_new3=df_new3.drop_duplicates(subset=['Volume']) just before df_new3.plot(kind='box', subplots=True, layout=(4,3), sharex=False, sharey=False) you would get 593 samples with 593 unique labels. Even with that the result continue to be: LR: 0.000000 (0.000000) KNN: 0.000000 (0.000000) CART: 0.000000 (0.000000) NB: 0.000000 (0.000000) SVM: 0.000000 (0.000000)
– José Luis Martínez
Nov 20 at 1:40

@JoséLuisMartínez Then you didn't understand my explanation of the reason. The last section actually explains what's happening. I have added a slightly more concrete explanation.
– Geeocode
Nov 20 at 4:55

@JoséLuisMartínez I am happy if I could help you, José Luis.
– Geeocode
Nov 21 at 22:19

add a comment |

up vote
1
down vote

accepted

print(len(Y))

print(len(np.unique(Y)))

Out:

621

593

To test this, try a trick (a crude kind of data augmentation, for testing purposes only) just before train_test_split:

X = 5 * list(X)

Y = 5 * list(Y)

and your results will be much better right away:

LR: 0.015700 (0.000403)

KNN: 0.028583 (0.000403)

CART: 0.018519 (0.001610)

NB: 0.018519 (0.001610)

SVM: 0.010870 (0.000403)

To better understand say we have

0100000000 256

0100000000 675

0100000000 912

0100000000 112

0100000000 745

0100000000 312

Then the estimator tries to estimate the proper label for the feature value:

0100000000

which will be something like this, depending on the estimator and its options:

{256: 0.333, 675: 0.333, 912: 0.333}

So the validation (test) accuracies are then:

0100000000 112 at this label: 0.00

0100000000 745 at this label: 0.00

0100000000 312 at this label: 0.00

I hope it is clear now to everybody.

edited Nov 20 at 5:15

answered Nov 19 at 22:13

Geeocode

2,0461819

Liked your augmentation approach! => More data to dilute the lack of capability from the deterministic estimators to find learned values in the validation samples...
– José Luis Martínez
Nov 20 at 0:22

However, the suggested reason (having 621 samples with 593 unique labels) is not the cause of the problem: after inserting df_new3=df_new3.drop_duplicates(subset=['Volume']) just before df_new3.plot(kind='box', subplots=True, layout=(4,3), sharex=False, sharey=False), you get 593 samples with 593 unique labels. Even then, the results continue to be: LR: 0.000000 (0.000000) KNN: 0.000000 (0.000000) CART: 0.000000 (0.000000) NB: 0.000000 (0.000000) SVM: 0.000000 (0.000000)
– José Luis Martínez
Nov 20 at 1:40

@JoséLuisMartínez Then you didn't understand my explanation of the reason. The last section actually explains what's happening. I have added a slightly more concrete explanation.
– Geeocode
Nov 20 at 4:55

@JoséLuisMartínez I am happy if I could help you, José Luis.
– Geeocode
Nov 21 at 22:19

add a comment |

up vote
1
down vote

accepted

print(len(Y))

print(len(np.unique(Y)))

Out:

621

593

To test this, try a trick (a crude kind of data augmentation, for testing purposes only) just before train_test_split:

X = 5 * list(X)

Y = 5 * list(Y)

and your results will be much better right away:

LR: 0.015700 (0.000403)

KNN: 0.028583 (0.000403)

CART: 0.018519 (0.001610)

NB: 0.018519 (0.001610)

SVM: 0.010870 (0.000403)

To better understand say we have

0100000000 256

0100000000 675

0100000000 912

0100000000 112

0100000000 745

0100000000 312

Then the estimator tries to estimate the proper label for the feature value:

0100000000

which will be something like this, depending on the estimator and its options:

{256: 0.333, 675: 0.333, 912: 0.333}

So the validation (test) accuracies are then:

0100000000 112 at this label: 0.00

0100000000 745 at this label: 0.00

0100000000 312 at this label: 0.00

I hope it is clear now to everybody.

edited Nov 20 at 5:15

answered Nov 19 at 22:13

Geeocode

2,0461819

print(len(Y))

print(len(np.unique(Y)))

Out:

621

593

To test this, try a trick (a crude kind of data augmentation, for testing purposes only) just before train_test_split:

X = 5 * list(X)

Y = 5 * list(Y)

and your results will be much better right away:

LR: 0.015700 (0.000403)

KNN: 0.028583 (0.000403)

CART: 0.018519 (0.001610)

NB: 0.018519 (0.001610)

SVM: 0.010870 (0.000403)

To better understand say we have

0100000000 256

0100000000 675

0100000000 912

0100000000 112

0100000000 745

0100000000 312

Then the estimator tries to estimate the proper label for the feature value:

0100000000

which will be something like this, depending on the estimator and its options:

{256: 0.333, 675: 0.333, 912: 0.333}

So the validation (test) accuracies are then:

0100000000 112 at this label: 0.00

0100000000 745 at this label: 0.00

0100000000 312 at this label: 0.00

I hope it is clear now to everybody.

edited Nov 20 at 5:15

answered Nov 19 at 22:13

Geeocode

2,0461819

edited Nov 20 at 5:15

answered Nov 19 at 22:13

Geeocode

2,0461819

answered Nov 19 at 22:13

Geeocode

2,0461819

answered Nov 19 at 22:13

Geeocode

2,0461819

Liked your augmentation approach! => More data to dilute the lack of capability from the deterministic estimators to find learned values in the validation samples...
– José Luis Martínez
Nov 20 at 0:22

However, the suggested reason (having 621 samples with 593 unique labels) is not the cause of the problem: after inserting df_new3=df_new3.drop_duplicates(subset=['Volume']) just before df_new3.plot(kind='box', subplots=True, layout=(4,3), sharex=False, sharey=False), you get 593 samples with 593 unique labels. Even then, the results continue to be: LR: 0.000000 (0.000000) KNN: 0.000000 (0.000000) CART: 0.000000 (0.000000) NB: 0.000000 (0.000000) SVM: 0.000000 (0.000000)
– José Luis Martínez
Nov 20 at 1:40

@JoséLuisMartínez Then you didn't understand my explanation of the reason. The last section actually explains what's happening. I have added a slightly more concrete explanation.
– Geeocode
Nov 20 at 4:55

@JoséLuisMartínez I am happy if I could help you, José Luis.
– Geeocode
Nov 21 at 22:19

add a comment |

Liked your augmentation approach! => More data to dilute the lack of capability from the deterministic estimators to find learned values in the validation samples...
– José Luis Martínez
Nov 20 at 0:22

However, the suggested reason (having 621 samples with 593 unique labels) is not the cause of the problem: after inserting df_new3=df_new3.drop_duplicates(subset=['Volume']) just before df_new3.plot(kind='box', subplots=True, layout=(4,3), sharex=False, sharey=False), you get 593 samples with 593 unique labels. Even then, the results continue to be: LR: 0.000000 (0.000000) KNN: 0.000000 (0.000000) CART: 0.000000 (0.000000) NB: 0.000000 (0.000000) SVM: 0.000000 (0.000000)
– José Luis Martínez
Nov 20 at 1:40

@JoséLuisMartínez Then you didn't understand my explanation of the reason. The last section actually explains what's happening. I have added a slightly more concrete explanation.
– Geeocode
Nov 20 at 4:55

@JoséLuisMartínez I am happy if I could help you, José Luis.
– Geeocode
Nov 21 at 22:19

Liked your augmentation approach! => More data to dilute the lack of capability from the deterministic estimators to find learned values in the validation samples...
– José Luis Martínez
Nov 20 at 0:22

However, the suggested reason (having 621 samples with 593 unique labels) is not the cause of the problem: after inserting df_new3=df_new3.drop_duplicates(subset=['Volume']) just before df_new3.plot(kind='box', subplots=True, layout=(4,3), sharex=False, sharey=False), you get 593 samples with 593 unique labels. Even then, the results continue to be:

LR: 0.000000 (0.000000) KNN: 0.000000 (0.000000) CART: 0.000000 (0.000000) NB: 0.000000 (0.000000) SVM: 0.000000 (0.000000)

– José Luis Martínez
Nov 20 at 1:40

LR: 0.000000 (0.000000) KNN: 0.000000 (0.000000) CART: 0.000000 (0.000000) NB: 0.000000 (0.000000) SVM: 0.000000 (0.000000)

– José Luis Martínez
Nov 20 at 1:40

@JoséLuisMartínez Then you didn't understand my explanation of the reason. The last section actually explains what's happening. I have added a slightly more concrete explanation.
– Geeocode
Nov 20 at 4:55

@JoséLuisMartínez I am happy if I could help you, José Luis.
– Geeocode
Nov 21 at 22:19

add a comment |

draft saved

draft discarded

Thanks for contributing an answer to Stack Overflow!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

Some of your past answers have not been well-received, and you're in danger of being blocked from answering.

Please pay close attention to the following guidance:

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is for reference only. If you need detailed information, please check here

搜尋此網誌

Ytukyg