Notebook: WineQuality_End_to_End_ML_Raja_Kartheek_Project.ipynb
Estimating Red Wine Quality
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import scatter_matrix
Setting the random seed for reproducibility
np.random.seed(42)
File path of the wine quality dataset; read it into a DataFrame
FilePath = "/cxldata/datasets/project/wine_quality_red.csv"
ReadFile = pd.read_csv(FilePath)
Look at the data to get a brief understanding of what we are dealing with
WQHeader = ReadFile.head()
print(WQHeader)
Split the file into test and train datasets
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
train_set, test_set = split_train_test(ReadFile, 0.25)
#print(train_set)
#print(test_set)
# Glimpse of the data - number of rows, null counts, column dtypes
print(ReadFile.info())
print(ReadFile.describe())
Summaries (info and describe) of the test and train sets
print(test_set.info())
print(train_set.info())
print(test_set.describe())
print(train_set.describe())
Plot histograms of all attributes
ReadFile.hist(bins=15,figsize=(20,15))
plt.show()
Using hashlib for a stable, id-based test split
import hashlib
def test_set_check(identifier, test_ratio, hash):
    # Keep an instance in the test set if the last byte of its hash falls below 256 * test_ratio
    return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio
def split_train_test_by_id(data, test_ratio, id_column, hash=hashlib.md5):
    ids = data[id_column]
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio, hash))
    print(in_test_set)
    return data.loc[~in_test_set], data.loc[in_test_set]
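A quick sanity check (added for illustration, not from the original notebook): the last byte of an MD5 digest is roughly uniform over 0-255, so the fraction of ids passing the check should be close to the test ratio.
sample_ids = np.arange(10000)
in_test = [test_set_check(i, 0.25, hashlib.md5) for i in sample_ids]
print(sum(in_test) / len(sample_ids))  # should be close to 0.25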
Add an index column to use as a stable id
WineFile_with_id = ReadFile.reset_index()
print(WineFile_with_id)
ids = WineFile_with_id["index"]
# test_set_check expects a single id, so apply it element-wise
intstset = ids.apply(lambda id_: test_set_check(id_, 0.25, hashlib.md5))
print("Printing intstset ...")
print(intstset)
train_set1, test_set1 = split_train_test_by_id(WineFile_with_id, 0.25, "index", hashlib.md5)
print(train_set1)
print(test_set1)
print(len(train_set1), "train +", len(test_set1), "test")
test_set1.head()
# Quick check of np.random.permutation behavior
np.random.seed(1)
np.random.permutation(4)
from sklearn.model_selection import train_test_split
train_set3,test_set3 = train_test_split(ReadFile,test_size=0.25,random_state=42)
print("train_set3", len(train_set3), "+", "test_set3", len(test_set3))
test_set3.head()
from sklearn.model_selection import StratifiedShuffleSplit
ReadFile["quality"].value_counts()
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(ReadFile, ReadFile["quality"]):
    strat_train_set = ReadFile.loc[train_index]
    strat_test_set = ReadFile.loc[test_index]
print(strat_train_set["quality"].value_counts())
print(strat_test_set["quality"].value_counts())
train_set_rnd, test_set_rnd = train_test_split(ReadFile, test_size=0.25, random_state=42)
def quality_cat_proportions(data):
    return data["quality"].value_counts() / len(data)
compare_props = pd.DataFrame({
    "Overall": quality_cat_proportions(ReadFile),
    "Stratified": quality_cat_proportions(strat_test_set),
    "Random": quality_cat_proportions(test_set_rnd),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100
compare_props
Stratified sampling copy for exploration
WineQ_strat_train = strat_train_set.copy()
WineQ_corr_matrix = WineQ_strat_train.corr()
print("WineQ Correlation Matrix")
print(WineQ_corr_matrix)
Correlation matrix for selected attributes
attributes = ["citric acid", "sulphates", "alcohol", "quality"]
WineQ_corr_matrix[attributes]
Stratified Sample
WineQ_strat_train.describe()
Separating features and labels: drop quality from the training features
WineQ_strat_train = strat_train_set.drop("quality", axis=1)  # drop labels for training set
WineQ_labels = strat_train_set["quality"].copy()
print("Wine Quality Strat")
print(WineQ_strat_train)
Check whether any rows contain nulls
isn = WineQ_strat_train.isnull()
isn.any(axis=1)
Sample incomplete rows - checking if there are any null values
sample_incomplete_rows = WineQ_strat_train[WineQ_strat_train.isnull().any(axis=1)]
sample_incomplete_rows
Quality attribute - categorical
Although quality is numeric, it is not continuous: it takes discrete levels, so we one-hot encode it.
print(WineQ_labels)
WineQ_Hot_Encoder = WineQ_labels.to_numpy()
print(WineQ_Hot_Encoder)
One Hot Encoding
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
WineQ_cat_1hot = encoder.fit_transform(WineQ_Hot_Encoder.reshape(-1,1))
WineQ_strat_HotEncoder = WineQ_cat_1hot.toarray()
print("Wine Quality 1hot Encoder")
print(WineQ_strat_HotEncoder)
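As a quick check (added for illustration), the fitted encoder exposes the discrete quality levels it found:
print(encoder.categories_)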
Dense one-hot encoder
# CategoricalEncoder (from the Hands-On ML notebooks) never shipped in scikit-learn;
# OneHotEncoder(sparse=False) is the released equivalent of encoding="onehot-dense"
cat_encoder = OneHotEncoder(sparse=False)
WineQ_strat_reshaped = WineQ_labels.values.reshape(-1, 1)
print(WineQ_strat_reshaped)
WineQ_strat_1hot = cat_encoder.fit_transform(WineQ_strat_reshaped)
print("Wine Quality Strat one hot dense encoder")
print(WineQ_strat_1hot)
cat_encoder.categories_
Scaling the columns below, as their distributions look somewhat skewed
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from mlxtend.preprocessing import minmax_scaling
print(WineQ_strat_train)
WineQ_strat_train.hist()
Performing feature scaling and standardization
X1 = WineQ_strat_train.copy()
X2 = WineQ_strat_train.copy()
print("Performing both min-max scaling and standard scaling")
stndrd_cols = ['alcohol', 'chlorides', 'citric acid', 'free sulfur dioxide', 'total sulfur dioxide']
print("Value counts before standardization")
for col in stndrd_cols:
    print(X1[col].value_counts())
# fit_transform returns a new array; assign it back or X1 stays unchanged
X1[stndrd_cols] = StandardScaler().fit_transform(X1[stndrd_cols])
print("Data after standard scaling")
print(X1[stndrd_cols])
print("Value counts after standardization")
for col in stndrd_cols:
    print(X1[col].value_counts())
print(X1[stndrd_cols].info())
print(X1[stndrd_cols].describe())
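A quick sanity check (added for illustration): after StandardScaler each column should have mean close to 0 and standard deviation close to 1.
print(X1[stndrd_cols].mean().round(3))
print(X1[stndrd_cols].std().round(3))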
print("Data before min-max scaling")
print("Value counts before min-max scaling")
for col in stndrd_cols:
    print(X2[col].value_counts())
# mlxtend's minmax_scaling is a plain function, not an estimator; assign its result back
X2[stndrd_cols] = minmax_scaling(X2, columns=stndrd_cols)
print("Data after min-max scaling")
print(X2[stndrd_cols])
print("Value counts after min-max scaling")
for col in stndrd_cols:
    print(X2[col].value_counts())
print(X2[stndrd_cols].info())
print(X2[stndrd_cols].describe())
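Analogous sanity check (added for illustration): the min-max scaled columns should now lie in [0, 1].
print(X2[stndrd_cols].min())
print(X2[stndrd_cols].max())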
Creating the pipeline. Quality is a numeric attribute with discrete values, so it is treated as categorical.
num_attribs = list(WineQ_strat_train)
print("num attribs")
print(num_attribs)
cat_attribs = ["quality"]
print("cat attribs")
print(cat_attribs)
No nulls, so no imputer; no combined attributes. The num_pipeline therefore contains just one step, a column selector (see the DataFrameSelector sketch below).
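DataFrameSelector is not a scikit-learn class; it is the small custom transformer used in the Hands-On ML notebooks. A minimal sketch, assuming it only selects the named columns and returns their values as a NumPy array:
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Select the given DataFrame columns and return them as a NumPy array."""
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values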
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs))
])
Categorical pipeline. Here the categorical variable is quality, the dependent (output) variable.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    # OneHotEncoder(sparse=False) stands in for the book's CategoricalEncoder(encoding="onehot-dense")
    ('cat_encoder', OneHotEncoder(sparse=False)),
])
num_pipeline.fit_transform(WineQ_strat_train)
Creating a clean-slate copy of the stratified training set
WineQ_Strt_train = strat_train_set.copy()
print('Cat pipeline')
WineQCat = cat_pipeline.fit_transform(WineQ_Strt_train)
print(WineQCat)
Quality stays in WineQ_Strt_train for the full pipeline (cat_pipeline selects it), but keep a features-only copy as well
WineQ_strat_train = WineQ_Strt_train.drop("quality", axis=1)
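A quick check (added for illustration) that the features-only copy no longer carries the label:
print('quality' in WineQ_strat_train.columns)  # expect False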
Define the full pipeline
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline)
])
Run the full pipeline
WineQ_prepared = full_pipeline.fit_transform(WineQ_Strt_train)
WineQ_DF = pd.DataFrame(WineQ_prepared)
print(WineQ_prepared.shape)
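For the standard UCI red wine file, quality takes six levels (3-8), so the prepared matrix should have the 11 numeric columns plus 6 one-hot columns, 17 in total. A quick check (added for illustration, assuming the pipeline above):
n_levels = len(cat_pipeline.named_steps['cat_encoder'].categories_[0])
print(len(num_attribs), '+', n_levels, '=', WineQ_prepared.shape[1])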
print(WineQ_prepared[:1])
print(WineQ_prepared)
print(WineQ_DF)
print(WineQ_DF.info())
print(WineQ_DF.describe())
print(WineQ_DF[0])
print(WineQ_DF[1])
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(WineQ_prepared,WineQ_labels)
Error Occurred:
/usr/local/anaconda/lib/python3.6/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=1000,
multi_class='auto', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
Steps I have taken:
- Increased the iteration count (see the sketch below)
- Standardized the data using both min-max scaling and StandardScaler
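A sketch of the increased-iteration run (max_iter=1000, matching the model repr above; assuming the same prepared features and labels):
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(WineQ_prepared, WineQ_labels)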
Could you please look into this and suggest the next steps?