import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import (
    FunctionTransformer,
    Normalizer,
    OneHotEncoder,
    StandardScaler,
    normalize,
    scale)
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score, accuracy_score
from sklearn.model_selection import GridSearchCV
import altair as alt
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.feature_extraction.text import CountVectorizer


cheese_df = pd.read_csv("data/cheese_data.csv")
cheese_df.head()


cheese_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042 entries, 0 to 1041
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   CheeseId              1042 non-null   int64  
 1   ManufacturerProvCode  1042 non-null   object 
 2   ManufacturingTypeEn   1042 non-null   object 
 3   MoisturePercent       1028 non-null   float64
 4   FlavourEn             801 non-null    object 
 5   CharacteristicsEn     643 non-null    object 
 6   Organic               1042 non-null   int64  
 7   CategoryTypeEn        1019 non-null   object 
 8   MilkTypeEn            1041 non-null   object 
 9   MilkTreatmentTypeEn   977 non-null    object 
 10  RindTypeEn            721 non-null    object 
 11  CheeseName            1042 non-null   object 
 12  FatLevel              1042 non-null   object 
dtypes: float64(1), int64(2), object(10)
memory usage: 106.0+ KB


cheese_df.shape

(1042, 13)


train_df, test_df = train_test_split(cheese_df, test_size=0.2, random_state=111)

X_train_big = train_df.drop(columns=["CheeseId", "FlavourEn", "CharacteristicsEn", "RindTypeEn", "CheeseName", "FatLevel"])
y_train_big = train_df["FatLevel"]

X_test = test_df.drop(columns=["CheeseId", "FlavourEn", "CharacteristicsEn", "RindTypeEn", "CheeseName", "FatLevel"])
y_test = test_df["FatLevel"]


X_train, X_valid, y_train, y_valid = train_test_split(X_train_big, y_train_big, test_size=0.25, random_state=0)


train_df.head(2)


test_df.head(2)


X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 624 entries, 798 to 84
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ManufacturerProvCode  624 non-null    object 
 1   ManufacturingTypeEn   624 non-null    object 
 2   MoisturePercent       616 non-null    float64
 3   Organic               624 non-null    int64  
 4   CategoryTypeEn        609 non-null    object 
 5   MilkTypeEn            623 non-null    object 
 6   MilkTreatmentTypeEn   587 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 39.0+ KB


X_train.describe(include="all", percentiles=[])


X_train.isnull().sum()

ManufacturerProvCode     0
ManufacturingTypeEn      0
MoisturePercent          8
Organic                  0
CategoryTypeEn          15
MilkTypeEn               1
MilkTreatmentTypeEn     37
dtype: int64


y_train.describe

<bound method NDFrame.describe of 798     higher fat
194     higher fat
115      lower fat
995     higher fat
44      higher fat
           ...    
160     higher fat
1029    higher fat
883      lower fat
811     higher fat
84      higher fat
Name: FatLevel, Length: 624, dtype: object>


y_train.value_counts()

lower fat     415
higher fat    209
Name: FatLevel, dtype: int64


y_train.isnull().sum()

0


fat_level_plot = alt.Chart(train_df).mark_circle(size=60, opacity=0.3
 ).encode(
    x=alt.X('CheeseId', title='Cheese ID', scale=alt.Scale(zero=False)),
    y=alt.Y('FatLevel:N', title='Fat Level'),
    color=alt.Color('FatLevel', legend=alt.Legend(title="Fat Level")),
    tooltip=['CheeseId', 'FatLevel']
).properties(
    title={
      "text": ['Fat Level in Train Data'], 
      "subtitle": ['A scatter plot showing the fat level for each cheese ID.'],
      "color": "black",
      "subtitleColor": "gray"
    },
    height=150,
    width=800
).interactive()

fat_level_plot


train_df["FatLevel"].value_counts(normalize=True)

lower fat     0.660264
higher fat    0.339736
Name: FatLevel, dtype: float64


fat_level_prop = alt.Chart(train_df).mark_bar().encode(
    x='count()',
    y='FatLevel',
    color='FatLevel'
).properties(
    title='Fat Level Counts in Train Data',
    height=100
)

fat_level_prop


fat_level_prop_1 = alt.Chart(train_df).mark_bar().encode(
    x='ManufacturerProvCode:N',
    y='count()',
    color='FatLevel'
).properties(
    title='Fat Level in ManufacturerProvCode',
    height=100
)


fat_level_prop_2 = alt.Chart(train_df).mark_bar().encode(
    x='ManufacturingTypeEn:N',
    y='count()',
    color='FatLevel'
).properties(
    title='Fat Level in ManufacturingTypeEn',
    height=100
)


fat_level_prop_3 = alt.Chart(train_df).mark_bar().encode(
    x='MoisturePercent:N',
    y='count()',
    color='FatLevel'
).properties(
    title='MoisturePercent',
    height=100,
    width=1000
)


fat_level_prop_4 = alt.Chart(train_df).mark_bar().encode(
    x='Organic:N',
    y='count()',
    color='FatLevel'
).properties(
    title='Fat Level in Organic',
    height=100
)


fat_level_prop_5 = alt.Chart(train_df).mark_bar().encode(
    x='CategoryTypeEn:N',
    y='count()',
    color='FatLevel'
).properties(
    title='Fat Level in CategoryTypeEn',
    height=100
)


fat_level_prop_6 = alt.Chart(train_df).mark_bar().encode(
    x='MilkTypeEn:N',
    y='count()',
    color='FatLevel'
).properties(
    title='Fat Level in MilkTypeEn',
    height=100
)


fat_level_prop_7 = alt.Chart(train_df).mark_bar().encode(
    x='MilkTreatmentTypeEn:N',
    y='count()',
    color='FatLevel'
).properties(
    title='Fat Level in MilkTreatmentTypeEn',
    height=100
)


fat_level_prop_1 | fat_level_prop_2


fat_level_prop_3


fat_level_prop_4 | fat_level_prop_5 | fat_level_prop_6 | fat_level_prop_7


train_df["MoisturePercent"].unique()

array([43. , 34. , 42. , 69. , 35. , 39. , 46. , 50. , 36. , 44. , 55. ,
        nan, 74. , 57. , 40. , 52. , 45. , 58. , 56. , 37. , 65. , 54. ,
       41. , 33. , 60. , 38. , 63. , 48. , 68. , 31. , 62. , 27. , 49. ,
       47. , 30. , 78. , 20. , 72. , 31.5, 32. , 53. , 70. , 80. , 29. ,
       47.9, 76. , 40.3, 26. , 49.4, 51.7, 59. , 64. , 61. , 83. , 42.8,
       67. , 12. , 17. , 22. , 23. , 21. , 51. , 42.6, 75. , 24. , 88. ])


train_df["ManufacturerProvCode"].unique()

array(['QC', 'ON', 'BC', 'NS', 'MB', 'PE', 'AB', 'NB', 'SK', 'NL'],
      dtype=object)


train_df["ManufacturerProvCode"].value_counts()

QC    637
ON     92
BC     51
NB     21
AB     12
NS      9
MB      7
PE      2
SK      1
NL      1
Name: ManufacturerProvCode, dtype: int64


train_df["ManufacturerProvCode"].value_counts(normalize=True)

QC    0.764706
ON    0.110444
BC    0.061224
NB    0.025210
AB    0.014406
NS    0.010804
MB    0.008403
PE    0.002401
SK    0.001200
NL    0.001200
Name: ManufacturerProvCode, dtype: float64


train_df["ManufacturingTypeEn"].unique()

array(['Farmstead', 'Industrial', 'Artisan'], dtype=object)


train_df["CategoryTypeEn"].unique()

array(['Fresh Cheese', 'Firm Cheese', 'Semi-soft Cheese', 'Soft Cheese',
       'Hard Cheese', nan, 'Veined Cheeses'], dtype=object)


train_df["MilkTypeEn"].unique()

array(['Ewe', 'Cow', 'Goat', 'Cow and Goat', 'Ewe and Cow',
       'Cow, Goat and Ewe', nan, 'Ewe and Goat', 'Buffalo Cow'],
      dtype=object)


train_df["MilkTreatmentTypeEn"].unique()

array(['Pasteurized', 'Raw Milk', 'Thermised', nan], dtype=object)


train_df["Organic"].unique()

array([0, 1])


# Baseline
dummy = DummyClassifier(strategy="most_frequent")
pd.DataFrame(cross_validate(dummy, X_train, y_train, return_train_score=True)).mean()

fit_time       0.001775
score_time     0.000768
test_score     0.665071
train_score    0.665065
dtype: float64


dummy = DummyClassifier(strategy="stratified")

# Helped by Copilot
# The pos_label parameter is used in binary classification tasks to specify which of the two classes is the positive class. 
# In your case, it seems like your classes are ‘higher fat’ and ‘lower fat’, not 0 and 1.
scoring = {
    "accuracy": "accuracy",
    "f1": make_scorer(f1_score, pos_label='higher fat'),
    "recall": make_scorer(recall_score, pos_label='higher fat'),
    "precision": make_scorer(precision_score, pos_label='higher fat')
}
cv_results = cross_validate(dummy, X_train, y_train, cv=5, scoring=scoring, return_train_score=True)
dummy_scores = pd.DataFrame(cv_results)
dummy_scores


numeric_features = ["MoisturePercent"]


categorical_features = ["ManufacturerProvCode", "ManufacturingTypeEn", "CategoryTypeEn",
                        "MilkTypeEn", "MilkTreatmentTypeEn"]


binary_features = ["Organic"]


numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler()
)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore")
)

binary_transformer = make_pipeline(
    SimpleImputer(strategy="constant"),
    OneHotEncoder(drop="if_binary", dtype=int)
)


preprocessor = make_column_transformer(
            (numeric_transformer, numeric_features),
            (categorical_transformer, categorical_features),
            (binary_transformer, binary_features),
            remainder="passthrough"
)


dummy_pipe = make_pipeline(preprocessor, DummyClassifier(strategy="most_frequent"))
dummy_pipe.fit(X_train, y_train)
cv_results = cross_validate(dummy_pipe, X_train, y_train, return_train_score=True)
pd.DataFrame(cv_results).mean()

fit_time       0.030950
score_time     0.015391
test_score     0.665071
train_score    0.665065
dtype: float64


dummy_pipe = make_pipeline(preprocessor, DummyClassifier(strategy="stratified"))

# Helped by Copilot
# The pos_label parameter is used in binary classification tasks to specify which of the two classes is the positive class. 
# In your case, it seems like your classes are ‘higher fat’ and ‘lower fat’, not 0 and 1.
scoring = {
    "accuracy": "accuracy",
    "precision": make_scorer(precision_score, pos_label='higher fat'),
    "recall": make_scorer(recall_score, pos_label='higher fat'),
    "f1": make_scorer(f1_score, pos_label='higher fat'),
}
cv_results = cross_validate(dummy_pipe, X_train, y_train, cv=5, scoring=scoring, return_train_score=True)
dummy_scores = pd.DataFrame(cv_results)
dummy_scores.mean()

fit_time           0.034086
score_time         0.020785
test_accuracy      0.557755
train_accuracy     0.556489
test_precision     0.327905
train_precision    0.339010
test_recall        0.330546
train_recall       0.340954
test_f1            0.326847
train_f1           0.339577
dtype: float64


scoring = {
    "accuracy": "accuracy",
    "precision": make_scorer(precision_score, pos_label='higher fat'),
    "recall": make_scorer(recall_score, pos_label='higher fat'),
    "f1": make_scorer(f1_score, pos_label='higher fat'),
}


pipeline = Pipeline([
    ('preprocessor', preprocessor), 
    ('knn', KNeighborsClassifier())  
])

param_grid = {
    "knn__n_neighbors" : [1, 5, 10, 20, 30, 40, 50],
    "knn__weights" : ['uniform', 'distance']
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

grid_scores = pd.DataFrame(grid_search.cv_results_)
grid_scores.mean()

Fitting 5 folds for each of 14 candidates, totalling 70 fits

mean_fit_time              0.027620
std_fit_time               0.002497
mean_score_time            0.021333
std_score_time             0.002010
param_knn__n_neighbors    22.285714
split0_test_score          0.793143
split1_test_score          0.801714
split2_test_score          0.793714
split3_test_score          0.817714
split4_test_score          0.784562
mean_test_score            0.798170
std_test_score             0.024497
rank_test_score            7.428571
dtype: float64


best_hyperparams = grid_search.best_params_
print(best_hyperparams)

test_score = grid_search.score(X_test, y_test)
print(test_score)

{'knn__n_neighbors': 50, 'knn__weights': 'distance'}
0.861244019138756


pipe = make_pipeline(preprocessor, SVC(class_weight="balanced"))

param_grid = {
    "svc__gamma": [0.1, 1.0, 10, 100],
    "svc__C": [0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(pipe, param_grid, cv=5, return_train_score=True, scoring='accuracy', refit=True)
grid_search.fit(X_train, y_train)
multi_scores = pd.DataFrame(grid_search.cv_results_)
multi_scores.mean()

mean_fit_time          0.044823
std_fit_time           0.003300
mean_score_time        0.017054
std_score_time         0.001983
param_svc__C          27.775000
param_svc__gamma      27.775000
split0_test_score      0.731000
split1_test_score      0.751500
split2_test_score      0.759000
split3_test_score      0.778500
split4_test_score      0.696069
mean_test_score        0.743214
std_test_score         0.033558
rank_test_score        8.500000
split0_train_score     0.864354
split1_train_score     0.862851
split2_train_score     0.868236
split3_train_score     0.864354
split4_train_score     0.869875
mean_train_score       0.865934
std_train_score        0.008048
dtype: float64


pipe = make_pipeline(preprocessor, RandomForestClassifier(class_weight="balanced", random_state=77))
pipe.fit(X_train, y_train);

cv_results = cross_validate(pipe, X_train, y_train, cv=5, scoring=scoring, return_train_score=True)
scores = pd.DataFrame(cv_results)
scores.mean()

fit_time           0.314816
score_time         0.037166
test_accuracy      0.801290
train_accuracy     0.952726
test_precision     0.707939
train_precision    0.896336
test_recall        0.703136
train_recall       0.971300
test_f1            0.700616
train_f1           0.932294
dtype: float64


# Randomized hyperparameter optimization
param_grid = {
    "randomforestclassifier__max_depth": range(1,151,10)
}

depth_search = RandomizedSearchCV(pipe, param_grid, n_jobs=-1, cv=5,
                                  return_train_score=True, n_iter=5,
                                  random_state=77, verbose=3)

depth_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END randomforestclassifier__max_depth=141;, score=(train=0.958, test=0.792) total time=   0.3s
[CV 2/5] END randomforestclassifier__max_depth=141;, score=(train=0.946, test=0.816) total time=   0.3s
[CV 3/5] END randomforestclassifier__max_depth=141;, score=(train=0.956, test=0.752) total time=   0.3s
[CV 4/5] END randomforestclassifier__max_depth=141;, score=(train=0.956, test=0.840) total time=   0.3s
[CV 5/5] END randomforestclassifier__max_depth=141;, score=(train=0.948, test=0.806) total time=   0.3s
[CV 1/5] END randomforestclassifier__max_depth=61;, score=(train=0.958, test=0.792) total time=   0.3s
[CV 2/5] END randomforestclassifier__max_depth=61;, score=(train=0.946, test=0.816) total time=   0.3s
[CV 3/5] END randomforestclassifier__max_depth=61;, score=(train=0.956, test=0.752) total time=   0.3s
[CV 4/5] END randomforestclassifier__max_depth=61;, score=(train=0.956, test=0.840) total time=   0.3s
[CV 5/5] END randomforestclassifier__max_depth=61;, score=(train=0.948, test=0.806) total time=   0.4s
[CV 1/5] END randomforestclassifier__max_depth=21;, score=(train=0.958, test=0.792) total time=   0.3s
[CV 2/5] END randomforestclassifier__max_depth=21;, score=(train=0.946, test=0.816) total time=   0.4s
[CV 3/5] END randomforestclassifier__max_depth=21;, score=(train=0.956, test=0.752) total time=   0.3s
[CV 4/5] END randomforestclassifier__max_depth=21;, score=(train=0.956, test=0.840) total time=   0.4s
[CV 5/5] END randomforestclassifier__max_depth=21;, score=(train=0.948, test=0.806) total time=   0.4s
[CV 1/5] END randomforestclassifier__max_depth=91;, score=(train=0.958, test=0.792) total time=   0.3s
[CV 2/5] END randomforestclassifier__max_depth=91;, score=(train=0.946, test=0.816) total time=   0.3s
[CV 3/5] END randomforestclassifier__max_depth=91;, score=(train=0.956, test=0.752) total time=   0.3s
[CV 4/5] END randomforestclassifier__max_depth=91;, score=(train=0.956, test=0.840) total time=   0.4s
[CV 5/5] END randomforestclassifier__max_depth=91;, score=(train=0.948, test=0.806) total time=   0.4s
[CV 1/5] END randomforestclassifier__max_depth=31;, score=(train=0.958, test=0.792) total time=   0.3s
[CV 2/5] END randomforestclassifier__max_depth=31;, score=(train=0.946, test=0.816) total time=   0.3s
[CV 3/5] END randomforestclassifier__max_depth=31;, score=(train=0.956, test=0.752) total time=   0.3s
[CV 4/5] END randomforestclassifier__max_depth=31;, score=(train=0.956, test=0.840) total time=   0.3s
[CV 5/5] END randomforestclassifier__max_depth=31;, score=(train=0.948, test=0.806) total time=   0.3s

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               ['MoisturePercent']),
                                                                              ('pipeline-2',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='missing',
                                                                                                              s...
                                                                                                SimpleImputer(strategy='constant')),
                                                                                               ('onehotencoder',
                                                                                                OneHotEncoder(drop='if_binary',
                                                                                                              dtype=<class 'int'>))]),
                                                                               ['Organic'])])),
                                             ('randomforestclassifier',
                                              RandomForestClassifier(class_weight='balanced',
                                                                     random_state=77))]),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'randomforestclassifier__max_depth': range(1, 151, 10)},
                   random_state=77, return_train_score=True, verbose=3)


cv_results = depth_search.cv_results_
grid_results = pd.DataFrame({
    'mean_test_score': cv_results['mean_test_score'],
    'param_randomforestclassifier__max_depth': cv_results['param_randomforestclassifier__max_depth'],
    'mean_fit_time': cv_results['mean_fit_time'],
    'rank_test_score': cv_results['rank_test_score']
})

grid_results = grid_results.sort_values(by='rank_test_score')
grid_results


best_depth = depth_search.best_params_['randomforestclassifier__max_depth']
print(f'The best parameter: {best_depth}')

best_depth_score = depth_search.best_score_
print(f'The best score: {best_depth_score}')

test_score = depth_search.score(X_test, y_test)
print(f'The test score: {test_score}')

The best parameter: 141
The best score: 0.8012903225806453
The test score: 0.8421052631578947


depth_search.classes_
print(classification_report(y_test, depth_search.predict(X_test),
                            target_names=["higer fat", "lower fat"]))

              precision    recall  f1-score   support

   higer fat       0.78      0.79      0.78        75
   lower fat       0.88      0.87      0.88       134

    accuracy                           0.84       209
   macro avg       0.83      0.83      0.83       209
weighted avg       0.84      0.84      0.84       209


pipe = make_pipeline(preprocessor, RandomForestClassifier(class_weight="balanced", random_state=77, max_depth=141))
pipe.fit(X_train, y_train);

cv_results = cross_validate(pipe, X_train, y_train, cv=5, scoring=scoring, return_train_score=True)
scores = pd.DataFrame(cv_results)
scores.mean()

fit_time           0.306594
score_time         0.036786
test_accuracy      0.801290
train_accuracy     0.952726
test_precision     0.707939
train_precision    0.896336
test_recall        0.703136
train_recall       0.971300
test_f1            0.700616
train_f1           0.932294
dtype: float64


param_grid = {
    'randomforestclassifier__n_estimators': [100, 200, 300, 400, 500],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
    'randomforestclassifier__max_features': ['auto', 'sqrt']              
}

grid_search = RandomizedSearchCV(pipe, param_grid, n_jobs=-1, cv=5,
                                  return_train_score=True, n_iter=5,
                                  random_state=77, verbose=1)

grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               ['MoisturePercent']),
                                                                              ('pipeline-2',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='missing',
                                                                                                              s...
                                              RandomForestClassifier(class_weight='balanced',
                                                                     max_depth=141,
                                                                     random_state=77))]),
                   n_iter=5, n_jobs=-1,
                   param_distributions={'randomforestclassifier__max_features': ['auto',
                                                                                 'sqrt'],
                                        'randomforestclassifier__min_samples_leaf': [1,
                                                                                     2,
                                                                                     4],
                                        'randomforestclassifier__min_samples_split': [2,
                                                                                      5,
                                                                                      10],
                                        'randomforestclassifier__n_estimators': [100,
                                                                                 200,
                                                                                 300,
                                                                                 400,
                                                                                 500]},
                   random_state=77, return_train_score=True, verbose=1)


best_model = grid_search.best_estimator_
cv_results = cross_validate(best_model, X_train, y_train, scoring=scoring, return_train_score=True)
scores = pd.DataFrame(cv_results)
mean_scores = scores.mean()
mean_scores

fit_time           0.502473
score_time         0.052979
test_accuracy      0.831742
train_accuracy     0.899044
test_precision     0.734181
train_precision    0.830582
test_recall        0.779791
train_recall       0.878008
test_f1            0.753296
train_f1           0.853533
dtype: float64


grid_search.classes_
print(classification_report(y_test, depth_search.predict(X_test),
                            target_names=["higer fat", "lower fat"]))

              precision    recall  f1-score   support

   higer fat       0.78      0.79      0.78        75
   lower fat       0.88      0.87      0.88       134

    accuracy                           0.84       209
   macro avg       0.83      0.83      0.83       209
weighted avg       0.84      0.84      0.84       209


# Copilot suggestion to improve the model
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'randomforestclassifier__n_estimators': [100, 200, 300],
    'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2'],
    'randomforestclassifier__max_depth' : [80, 90, 100, 110],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
}

rscv = RandomizedSearchCV(pipe, param_dist, n_jobs=1, cv=5, n_iter=10)

rscv.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(remainder='passthrough',
                                                                transformers=[('pipeline-1',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(strategy='median')),
                                                                                               ('standardscaler',
                                                                                                StandardScaler())]),
                                                                               ['MoisturePercent']),
                                                                              ('pipeline-2',
                                                                               Pipeline(steps=[('simpleimputer',
                                                                                                SimpleImputer(fill_value='missing',
                                                                                                              s...
                                              RandomForestClassifier(class_weight='balanced',
                                                                     max_depth=141,
                                                                     random_state=77))]),
                   n_jobs=1,
                   param_distributions={'randomforestclassifier__max_depth': [80,
                                                                              90,
                                                                              100,
                                                                              110],
                                        'randomforestclassifier__max_features': ['auto',
                                                                                 'sqrt',
                                                                                 'log2'],
                                        'randomforestclassifier__min_samples_leaf': [1,
                                                                                     2,
                                                                                     4],
                                        'randomforestclassifier__min_samples_split': [2,
                                                                                      5,
                                                                                      10],
                                        'randomforestclassifier__n_estimators': [100,
                                                                                 200,
                                                                                 300]})


best_model = rscv.best_estimator_
cv_results = cross_validate(best_model, X_train, y_train, cv=5, scoring=scoring, return_train_score=True)
scores = pd.DataFrame(cv_results)
mean_scores = scores.mean()
mean_scores

fit_time           0.738583
score_time         0.070373
test_accuracy      0.834929
train_accuracy     0.902247
test_precision     0.737791
train_precision    0.830040
test_recall        0.789199
train_recall       0.891175
test_f1            0.760142
train_f1           0.859355
dtype: float64


rscv.classes_
print(classification_report(y_test, depth_search.predict(X_test),
                            target_names=["higer fat", "lower fat"]))

              precision    recall  f1-score   support

   higer fat       0.78      0.79      0.78        75
   lower fat       0.88      0.87      0.88       134

    accuracy                           0.84       209
   macro avg       0.83      0.83      0.83       209
weighted avg       0.84      0.84      0.84       209


best_model = rscv.best_estimator_
best_model.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['MoisturePercent']),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(fill_value='missing',
                                                                                 strategy='constant')),
                                                                  ('onehotencod...
                                                   'MilkTreatmentTypeEn']),
                                                 ('pipeline-3',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(drop='if_binary',
                                                                                 dtype=<class 'int'>))]),
                                                  ['Organic'])])),
                ('randomforestclassifier',
                 RandomForestClassifier(class_weight='balanced', max_depth=80,
                                        max_features='log2',
                                        min_samples_split=10, n_estimators=300,
                                        random_state=77))])


best_model.score(X_test, y_test)

0.861244019138756


from sklearn.metrics import  plot_confusion_matrix

plot_confusion_matrix(pipe, X_valid, y_valid, display_labels=["high fat", "Low fat"], values_format="d", cmap="YlGn");


from sklearn.metrics import confusion_matrix

predictions = pipe.predict(X_valid)
confusion_matrix(y_valid, predictions)

array([[ 56,  18],
       [ 16, 119]])


confusion_matrix(y_valid, predictions)

array([[ 56,  18],
       [ 16, 119]])


TN, FP, FN, TP = confusion_matrix(y_valid, predictions).ravel()


recall = TP / (TP + FN)
precision = TP / (TP + FP)
data = {}
data["accuracy"] = [(TP + TN) / (TN + FP + FN + TP)]
data["error"] = [(FP + FN) / (TN + FP + FN + TP)]
data["precision"] = [ TP / (TP + FP)] 
data["recall"] = [TP / (TP + FN)] 
data["f1 score"] = [(2 * precision * recall) / (precision + recall)] 
measures_df = pd.DataFrame(data, index=['ourselves'])
measures_df


predicted_y = pipe.predict(X_valid)

precision = precision_score(y_valid, predicted_y, pos_label='higher fat').round(3)
print("precision: ", precision)

recall = recall_score(y_valid, predicted_y, pos_label='higher fat').round(3)
print("recall: ", recall)

f1 = f1_score(y_valid, predicted_y, pos_label='higher fat').round(3)
print("f1:", f1)

print(classification_report(y_valid, predicted_y, digits=3))

precision:  0.778
recall:  0.757
f1: 0.767
              precision    recall  f1-score   support

  higher fat      0.778     0.757     0.767        74
   lower fat      0.869     0.881     0.875       135

    accuracy                          0.837       209
   macro avg      0.823     0.819     0.821       209
weighted avg      0.836     0.837     0.837       209


pipe.classes_

array(['higher fat', 'lower fat'], dtype=object)


print(classification_report(y_valid, pipe.predict(X_valid)))

              precision    recall  f1-score   support

  higher fat       0.78      0.76      0.77        74
   lower fat       0.87      0.88      0.87       135

    accuracy                           0.84       209
   macro avg       0.82      0.82      0.82       209
weighted avg       0.84      0.84      0.84       209


importances = pipe.named_steps['randomforestclassifier'].feature_importances_

print("Feature ranking:")

for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, f, importances[f]))

Feature ranking:
1. feature 0 (0.514878)
2. feature 1 (0.005168)
3. feature 2 (0.014288)
4. feature 3 (0.004052)
5. feature 4 (0.004764)
6. feature 5 (0.004204)
7. feature 6 (0.016653)


X_train.columns

Index(['ManufacturerProvCode', 'ManufacturingTypeEn', 'MoisturePercent',
       'Organic', 'CategoryTypeEn', 'MilkTypeEn', 'MilkTreatmentTypeEn'],
      dtype='object')


# Initialize the LabelEncoder
from sklearn.preprocessing import LabelEncoder

# Initialize the LabelEncoder
le = LabelEncoder()

# Fit label encoder and return encoded labels
y_train_binary = le.fit_transform(y_train)


y_train_binary

array([0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 0])


from sklearn.linear_model import Ridge

rm_pipe = make_pipeline(preprocessor, Ridge())

param_dist = {
    'ridge__alpha': [0.1, 1, 10, 100, 1000, 10000]
}

grid_search = GridSearchCV(
    rm_pipe,
    param_dist,
    cv=5,
    n_jobs=-1
)

grid_search.fit(X_train, y_train_binary)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('standardscaler',
                                                                                          StandardScaler())]),
                                                                         ['MoisturePercent']),
                                                                        ('pipeline-2',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(fill_value='missing',
                                                                                                        strateg...
                                                                         ['ManufacturerProvCode',
                                                                          'ManufacturingTypeEn',
                                                                          'CategoryTypeEn',
                                                                          'MilkTypeEn',
                                                                          'MilkTreatmentTypeEn']),
                                                                        ('pipeline-3',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer(strategy='constant')),
                                                                                         ('onehotencoder',
                                                                                          OneHotEncoder(drop='if_binary',
                                                                                                        dtype=<class 'int'>))]),
                                                                         ['Organic'])])),
                                       ('ridge', Ridge())]),
             n_jobs=-1,
             param_grid={'ridge__alpha': [0.1, 1, 10, 100, 1000, 10000]})


best_alpha = grid_search.best_params_
print(best_alpha)

{'ridge__alpha': 10}


best_score = grid_search.best_score_
print(best_score)

0.2617533611936226


from sklearn.metrics import make_scorer

# https://stackoverflow.com/questions/14861891/runtimewarning-invalid-value-encountered-in-divide
np.seterr(divide='ignore', invalid='ignore')

def mape(true, pred):
  
    # In this code, epsilon is a small constant added to avoid division by zero.
    epsilon = 1e-10  # small constant
    return 100.0 * np.mean(np.abs((pred - true) / (true + epsilon)))

mape_scorer = make_scorer(mape, greater_is_better=False)


scoring_dict = {"neg_mean_squared_error": "neg_mean_squared_error",
                "neg_root_mean_squared_error": "neg_root_mean_squared_error",
                "neg_mean_absolute_error": "neg_mean_absolute_error",
                "r2": "r2",
                "mape_scorer": mape_scorer
               }


regression_scores = pd.DataFrame(cross_validate(pipe, X_train, y_train_binary, return_train_score=True, scoring=scoring_dict))
regression_scores


regression_mean = regression_scores.mean()
regression_mean

fit_time                             2.998348e-01
score_time                           3.392415e-02
test_neg_mean_squared_error         -1.987097e-01
train_neg_mean_squared_error        -4.727375e-02
test_neg_root_mean_squared_error    -4.445919e-01
train_neg_root_mean_squared_error   -2.171470e-01
test_neg_mean_absolute_error        -1.987097e-01
train_neg_mean_absolute_error       -4.727375e-02
test_r2                              1.079412e-01
train_r2                             7.877836e-01
test_mape_scorer                    -9.938065e+10
train_mape_scorer                   -9.614429e+09
dtype: float64


train_df_copy = train_df.copy()
train_df_copy['combined'] = train_df_copy['FlavourEn'].astype(str) + ' ' + train_df_copy['CharacteristicsEn'].astype(str) + ' ' + train_df_copy['CategoryTypeEn'].astype(str) + ' ' + train_df_copy['MilkTypeEn'].astype(str) + ' ' + train_df_copy['RindTypeEn'].astype(str)

cv = CountVectorizer()

vector = cv.fit_transform(train_df_copy['combined'])

combined_df = pd.DataFrame(vector.toarray(), columns=cv.get_feature_names())
combined_df


combined_df.columns

Index(['100', '16', '20', '25', '41', '50', 'accent', 'accents', 'accentuated',
       'accompanied',
       ...
       'year', 'years', 'yellow', 'yet', 'yeux', 'yogurt', 'young', 'younger',
       'your', 'yr'],
      dtype='object', length=806)


train_df_copy = train_df.copy()

train_df_copy['combined'] = train_df_copy['FlavourEn'].astype(str) + ' ' + train_df_copy['CharacteristicsEn'].astype(str) + ' ' + train_df_copy['CategoryTypeEn'].astype(str) + ' ' + train_df_copy['MilkTypeEn'].astype(str) + ' ' + train_df_copy['RindTypeEn'].astype(str)

cv = CountVectorizer()

vector = cv.fit_transform(train_df_copy['combined'])

combined_df = pd.DataFrame(vector.toarray(), columns=cv.get_feature_names())
combined_df


train_df_copy = train_df.copy()

train_df_copy['combined'] = train_df_copy['FlavourEn'].astype(str) + ' ' + train_df_copy['CharacteristicsEn'].astype(str) + ' ' + train_df_copy['CategoryTypeEn'].astype(str) + ' ' + train_df_copy['MilkTypeEn'].astype(str) + ' ' + train_df_copy['RindTypeEn'].astype(str)

cv = CountVectorizer()

vector = cv.fit_transform(train_df_copy['combined'])

combined_df = pd.DataFrame(vector.toarray(), columns=cv.get_feature_names())
combined_df


train_df_copy = train_df.copy()
train_df_copy['CheeseName'] = train_df_copy['CheeseName'].fillna(0)

vectorizer = CountVectorizer()

X_counts = vectorizer.fit_transform(train_df_copy['CheeseName'])

name_df = pd.DataFrame(X_counts.toarray(), columns=vectorizer.get_feature_names(), index=train_df_copy.index)
name_df


X = combined_df
y = name_df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=333)

dummy = DummyRegressor(strategy="mean")
dummy.fit(X_train, y_train);
cv_results = cross_validate(dummy, X_train, y_train, return_train_score=True)
pd.DataFrame(cv_results).mean()

fit_time       0.022749
score_time     0.014360
test_score     0.141416
train_score    0.268546
dtype: float64


rf = RandomForestRegressor(n_estimators=100, random_state=333)

rf.fit(X_train, y_train)

cv_results_rf = cross_validate(rf, X_train, y_train, return_train_score=True)
pd.DataFrame(cv_results_rf).mean()

fit_time       27.753072
score_time      0.066951
test_score      0.156343
train_score     0.830618
dtype: float64


pipe = Pipeline(
            steps=[("imputer", SimpleImputer(strategy="median")),
                   ("scaler", StandardScaler()),
                   ("knn", KNeighborsRegressor())]
)


param_grid = {
    "knn__n_neighbors" : [1, 5, 10, 20, 30, 40, 50],
    "knn__weights" : ['uniform', 'distance']
}


grid_search = GridSearchCV(pipe, param_grid, cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 14 candidates, totalling 70 fits

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('imputer',
                                        SimpleImputer(strategy='median')),
                                       ('scaler', StandardScaler()),
                                       ('knn', KNeighborsRegressor())]),
             n_jobs=-1,
             param_grid={'knn__n_neighbors': [1, 5, 10, 20, 30, 40, 50],
                         'knn__weights': ['uniform', 'distance']},
             verbose=1)


best_hyperparams = grid_search.best_params_
print(best_hyperparams)

best_score = grid_search.best_score_
print(best_score)

{'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
0.5381757088249467


test_score = grid_search.score(X_test, y_test)
test_score

0.4562198860155578

	ManufacturerProvCode	ManufacturingTypeEn	MoisturePercent	Organic	CategoryTypeEn	MilkTypeEn	MilkTreatmentTypeEn
count	624	624	616.000000	624.000000	609	623	587
unique	9	3	NaN	NaN	6	8	3
top	QC	Industrial	NaN	NaN	Firm Cheese	Cow	Pasteurized
freq	477	266	NaN	NaN	211	447	476
mean	NaN	NaN	47.123539	0.088141	NaN	NaN	NaN
std	NaN	NaN	9.472985	0.283727	NaN	NaN	NaN
min	NaN	NaN	12.000000	0.000000	NaN	NaN	NaN
50%	NaN	NaN	46.000000	0.000000	NaN	NaN	NaN
max	NaN	NaN	83.000000	1.000000	NaN	NaN	NaN

	fit_time	score_time	test_accuracy	train_accuracy	test_f1	train_f1	test_recall	train_recall	test_precision	train_precision
0	0.002124	0.008933	0.488000	0.523046	0.304348	0.323864	0.333333	0.341317	0.280000	0.308108
1	0.001781	0.033461	0.544000	0.561122	0.329412	0.326154	0.333333	0.317365	0.325581	0.335443
2	0.001698	0.007775	0.488000	0.539078	0.255814	0.315476	0.261905	0.317365	0.250000	0.313609
3	0.001857	0.007133	0.496000	0.507014	0.307692	0.297143	0.333333	0.311377	0.285714	0.284153
4	0.001961	0.008153	0.548387	0.580000	0.333333	0.371257	0.341463	0.369048	0.325581	0.373494

	fit_time	score_time	test_neg_mean_squared_error	train_neg_mean_squared_error	test_neg_root_mean_squared_error	train_neg_root_mean_squared_error	test_neg_mean_absolute_error	train_neg_mean_absolute_error	test_r2	train_r2	test_mape_scorer	train_mape_scorer
0	0.302047	0.032917	-0.208000	-0.042084	-0.456070	-0.205144	-0.208000	-0.042084	0.067699	0.810998	-1.040000e+11	-8.016032e+09
1	0.302868	0.030299	-0.184000	-0.054108	-0.428952	-0.232612	-0.184000	-0.054108	0.175273	0.756998	-1.280000e+11	-1.002004e+10
2	0.293237	0.039144	-0.248000	-0.044088	-0.497996	-0.209972	-0.248000	-0.044088	-0.111589	0.801998	-1.200000e+11	-8.016032e+09
3	0.304344	0.035096	-0.160000	-0.044088	-0.400000	-0.209972	-0.160000	-0.044088	0.282846	0.801998	-3.200000e+10	-1.002004e+10
4	0.296677	0.032165	-0.193548	-0.052000	-0.439941	-0.228035	-0.193548	-0.052000	0.125478	0.766925	-1.129032e+11	-1.200000e+10

Final Project

Introduction

Exploratory Data Analysis (EDA)

Preprocessing

Methods & Results

Discussion

References

References

Main Resources I used

	CheeseId	ManufacturerProvCode	ManufacturingTypeEn	MoisturePercent	FlavourEn	CharacteristicsEn	Organic	CategoryTypeEn	MilkTypeEn	MilkTreatmentTypeEn	RindTypeEn	CheeseName	FatLevel
0	228	NB	Farmstead	47.0	Sharp, lactic	Uncooked	0	Firm Cheese	Ewe	Raw Milk	Washed Rind	Sieur de Duplessis (Le)	lower fat
1	242	NB	Farmstead	47.9	Sharp, lactic, lightly caramelized	Uncooked	0	Semi-soft Cheese	Cow	Raw Milk	Washed Rind	Tomme Le Champ Doré	lower fat
2	301	ON	Industrial	54.0	Mild, tangy, and fruity	Pressed and cooked cheese, pasta filata, inter...	0	Firm Cheese	Cow	Pasteurized	NaN	Provolone Sette Fette (Tre-Stelle)	lower fat
3	303	NB	Farmstead	47.0	Sharp with fruity notes and a hint of wild honey	NaN	0	Veined Cheeses	Cow	Raw Milk	NaN	Geai Bleu (Le)	lower fat
4	319	NB	Farmstead	49.4	Softer taste	NaN	1	Semi-soft Cheese	Cow	Raw Milk	Washed Rind	Gamin (Le)	lower fat

	CheeseId	ManufacturerProvCode	ManufacturingTypeEn	MoisturePercent	FlavourEn	CharacteristicsEn	Organic	CategoryTypeEn	MilkTypeEn	MilkTreatmentTypeEn	RindTypeEn	CheeseName	FatLevel
980	2306	QC	Farmstead	43.0	NaN	NaN	0	Fresh Cheese	Ewe	Pasteurized	No Rind	Fromage Frais	lower fat
361	1385	QC	Industrial	43.0	Mild, hazelnut flavour, slightly acidulous	Smooth with small holes, no rind	0	Firm Cheese	Cow	Pasteurized	NaN	Bergeron Classique	lower fat

	CheeseId	ManufacturerProvCode	ManufacturingTypeEn	MoisturePercent	FlavourEn	CharacteristicsEn	Organic	CategoryTypeEn	MilkTypeEn	MilkTreatmentTypeEn	RindTypeEn	CheeseName	FatLevel
524	1552	QC	Artisan	40.0	Very mild. Curds are aslo available BBQ seasoned	Daily fresh, withe cheese curds, twist or blocks.	0	Firm Cheese	Cow	Pasteurized	No Rind	Cheddar frais du jour (Fromagerie Perron)	higher fat
707	1776	QC	Industrial	57.0	Mild and salty	Compact, unripened	0	Soft Cheese	Cow	Pasteurized	No Rind	Akawie (Fromagerie Polyethnique)	lower fat

	mean_test_score	param_randomforestclassifier__max_depth	mean_fit_time	rank_test_score
0	0.80129	141	0.304560	1
1	0.80129	61	0.309368	1
2	0.80129	21	0.329453	1
3	0.80129	91	0.318003	1
4	0.80129	31	0.301924	1

	100	16	20	25	41	50	accent	accents	accentuated	accompanied	...	year	years	yellow	yet	yeux	yogurt	young	younger	your	yr
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
829	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
830	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
831	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
832	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	100	16	20	25	41	50	accent	accents	accentuated	accompanied	...	year	years	yellow	yet	yeux	yogurt	young	younger	your	yr
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
829	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
830	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
831	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
832	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	100	16	20	25	41	50	accent	accents	accentuated	accompanied	...	year	years	yellow	yet	yeux	yogurt	young	younger	your	yr
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
829	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
830	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
831	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
832	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	115e	12	14	15	1608	1860	1ière	20	2ième	3ième	...	élan	élisabeth	élite	émile	érable	érables	étoile	évanjules	île	îles
980	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
361	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
130	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
119	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
374	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
118	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
681	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
86	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
724	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
876	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	100	16	20	25	41	50	accent	accents	accentuated	accompanied	...	year	years	yellow	yet	yeux	yogurt	young	younger	your	yr
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
829	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
830	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
831	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
832	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	100	16	20	25	41	50	accent	accents	accentuated	accompanied	...	year	years	yellow	yet	yeux	yogurt	young	younger	your	yr
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
829	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
830	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
831	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
832	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	100	16	20	25	41	50	accent	accents	accentuated	accompanied	...	year	years	yellow	yet	yeux	yogurt	young	younger	your	yr
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
829	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
830	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
831	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
832	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	115e	12	14	15	1608	1860	1ière	20	2ième	3ième	...	élan	élisabeth	élite	émile	érable	érables	étoile	évanjules	île	îles
980	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
361	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
130	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
119	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
374	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
118	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
681	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
86	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
724	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
876	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	100	16	20	25	41	50	accent	accents	accentuated	accompanied	...	year	years	yellow	yet	yeux	yogurt	young	younger	your	yr
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
829	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
830	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
831	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
832	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	100	16	20	25	41	50	accent	accents	accentuated	accompanied	...	year	years	yellow	yet	yeux	yogurt	young	younger	your	yr
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
829	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
830	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
831	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
832	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	100	16	20	25	41	50	accent	accents	accentuated	accompanied	...	year	years	yellow	yet	yeux	yogurt	young	younger	your	yr
0	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
3	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
4	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
828	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
829	0	0	0	0	0	0	0	0	0	0	...	0	0	1	0	0	0	0	0	0	0
830	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
831	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
832	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	115e	12	14	15	1608	1860	1ière	20	2ième	3ième	...	élan	élisabeth	élite	émile	érable	érables	étoile	évanjules	île	îles
980	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
361	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
130	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
119	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
374	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
118	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
681	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
86	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
724	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
876	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0