From Grape to Glass: A NumPy Journey in Wine Data Cleaning
Numeric programming with NumPy
Project.ipynb
Import Libraries¶
In [1]:
import numpy as np
print("Welcome to VinoVeritas Data Cleaning Project!")
Welcome to VinoVeritas Data Cleaning Project!
Activities¶
Activity 1: Uncorking the Data: Loading from CSV¶
In [74]:
wine_data = np.genfromtxt('winequality-red.csv',
                          delimiter=';',
                          skip_header=1,
                          filling_values=np.nan)
wine_data
Out[74]:
array([[ 7.4  ,  0.7  ,  0.   , ...,  0.56 ,  9.4  ,  5.   ],
       [ 7.8  ,  0.88 ,  0.   , ...,  0.68 ,  9.8  ,  5.   ],
       [ 7.8  ,  0.76 ,  0.04 , ...,  0.65 ,  9.8  ,  5.   ],
       ...,
       [ 6.3  ,  0.51 ,  0.13 , ...,  0.75 , 11.   ,  6.   ],
       [ 5.9  ,  0.645,  0.12 , ...,  0.71 , 10.2  ,  5.   ],
       [ 6.   ,  0.31 ,  0.47 , ...,  0.66 ,    nan,  6.   ]],
      shape=(1599, 12))
In [75]:
wine_data.shape
Out[75]:
(1599, 12)
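Note that skip_header=1 discards the column names. If they are needed later, they can be recovered separately; a minimal sketch, assuming the standard UCI file layout (quoted names separated by ';'):
In [ ]:
# Recover the header line that skip_header=1 skipped (assumption: the UCI
# winequality-red.csv format, i.e. quoted names separated by ';')
with open('winequality-red.csv') as f:
    header_line = f.readline().strip()
column_names = [name.strip('"') for name in header_line.split(';')]
column_names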
Activity 2: Searching for Ghosts: Locating the NaNs¶
In [76]:
nan_mask = np.isnan(wine_data)
total_nans = nan_mask.sum()                               # NaNs anywhere in the array
num_rows_with_nan = np.sum(np.sum(nan_mask, axis=1) > 0)  # rows with at least one NaN
print(total_nans, num_rows_with_nan)
191 184
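Not required by the activity, but a per-column breakdown shows whether the gaps concentrate in any one feature; a quick sketch reusing nan_mask:
In [ ]:
# NaNs per column (rather than per row)
nan_mask.sum(axis=0)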
Activity 3: Patch Up the Barrels: Imputing Missing Data¶
In [ ]:
# Scratch work: inspect per-row NaN counts before choosing a drop threshold
# np.sum(nan_mask, axis=1)
# np.sum(nan_mask, axis=1) <= 2
In [78]:
nan_mask = np.isnan(wine_data)
# Drop rows with 2 or more NaNs; rows with a single NaN are kept for imputation
cleaned_data = wine_data[np.sum(nan_mask, axis=1) < 2]
print(cleaned_data.shape)
(1592, 12)
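A quick cross-check, since 1599 - 1592 = 7: the dropped rows should be exactly those with 2 or more NaNs.
In [ ]:
# Should print 7, matching the rows removed above
print(np.sum(np.sum(nan_mask, axis=1) >= 2))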
In [79]:
# Impute: fill each remaining NaN with its column's mean (computed ignoring NaNs)
col_mean = np.nanmean(cleaned_data, axis=0)
inds = np.where(np.isnan(cleaned_data))          # (row, col) indices of the NaNs
cleaned_data[inds] = np.take(col_mean, inds[1])  # look up the mean for each NaN's column
cleaned_data
Out[79]:
array([[ 7.4       ,  0.7       ,  0.        , ...,  0.56      ,  9.4       ,  5.        ],
       [ 7.8       ,  0.88      ,  0.        , ...,  0.68      ,  9.8       ,  5.        ],
       [ 7.8       ,  0.76      ,  0.04      , ...,  0.65      ,  9.8       ,  5.        ],
       ...,
       [ 6.3       ,  0.51      ,  0.13      , ...,  0.75      , 11.        ,  6.        ],
       [ 5.9       ,  0.645     ,  0.12      , ...,  0.71      , 10.2       ,  5.        ],
       [ 6.        ,  0.31      ,  0.47      , ...,  0.66      , 10.4251746 ,  6.        ]],
      shape=(1592, 12))
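A sanity check (not part of the activity) that the imputation left nothing behind:
In [ ]:
# Should be 0 after imputation
print(np.isnan(cleaned_data).sum())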
In [ ]:
# Alternative solution: the same filtering and imputation via an explicit loop
In [48]:
# 1. Count NaNs per row
row_nan_count = np.sum(np.isnan(wine_data), axis=1)
# 2. Keep only rows with < 2 NaNs
filtered_data = wine_data[row_nan_count < 2]
# 3. Fill remaining NaNs in each column with the column mean
col_means = np.nanmean(filtered_data, axis=0) # ignore NaNs when computing means
# Helper function to fill NaNs with col mean
def fill_nan_with_mean(data, col_means):
    data_copy = data.copy()
    nan_mask = np.isnan(data_copy)
    for col_idx in range(data_copy.shape[1]):
        data_copy[nan_mask[:, col_idx], col_idx] = col_means[col_idx]
    return data_copy
cleaned_data = fill_nan_with_mean(filtered_data, col_means)
print("Original shape:", wine_data.shape)
print("Filtered shape:", filtered_data.shape)
print("Cleaned data shape:", cleaned_data.shape)
cleaned_data
Original shape: (1599, 12)
Filtered shape: (1592, 12)
Cleaned data shape: (1592, 12)
Out[48]:
array([[ 7.4       ,  0.7       ,  0.        , ...,  0.56      ,  9.4       ,  5.        ],
       [ 7.8       ,  0.88      ,  0.        , ...,  0.68      ,  9.8       ,  5.        ],
       [ 7.8       ,  0.76      ,  0.04      , ...,  0.65      ,  9.8       ,  5.        ],
       ...,
       [ 6.3       ,  0.51      ,  0.13      , ...,  0.75      , 11.        ,  6.        ],
       [ 5.9       ,  0.645     ,  0.12      , ...,  0.71      , 10.2       ,  5.        ],
       [ 6.        ,  0.31      ,  0.47      , ...,  0.66      , 10.4251746 ,  6.        ]],
      shape=(1592, 12))
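The two imputations can be cross-checked against each other; a sketch that re-runs the vectorized np.take version on the same filtered rows and compares:
In [ ]:
# Cross-check: both imputation strategies should agree cell for cell
vectorized = filtered_data.copy()
inds = np.where(np.isnan(vectorized))
vectorized[inds] = np.take(np.nanmean(vectorized, axis=0), inds[1])
print(np.allclose(vectorized, cleaned_data))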
Activity 4: Hunting for Outliers: Spotting Extreme Values¶
In [80]:
chemical_cols = cleaned_data[:, :-1]   # all features, excluding the quality column
data_mean = np.nanmean(chemical_cols, axis=0)
data_std = np.nanstd(chemical_cols, axis=0)
data_mean
Out[80]:
array([ 8.32889734,  0.52742079,  0.27076239,  2.54132362,  0.08746662,
       15.85142857, 46.32575758,  0.99673744,  3.31127376,  0.65861938,
       10.4251746 ])
In [42]:
# 3-sigma rule: flag any value more than 3 standard deviations from its column mean
total_outliers = np.sum((chemical_cols > (data_mean + 3*data_std)) |
                        (chemical_cols < (data_mean - 3*data_std)))
total_outliers
Out[42]:
np.int64(182)
In [81]:
outlier_mask = ((chemical_cols > (data_mean + 3*data_std)) |
                (chemical_cols < (data_mean - 3*data_std)))
outlier_mask
Out[81]:
array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]],
      shape=(1592, 11))
In [133]:
np.sum(outlier_mask)
Out[133]:
np.int64(182)
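The same mask can be written with explicit z-scores, which flags the identical 182 cells as long as no column has zero spread; a minimal sketch:
In [ ]:
# z-score formulation of the 3-sigma rule
z_scores = (chemical_cols - data_mean) / data_std
print(np.sum(np.abs(z_scores) > 3))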
In [ ]:
# Alternative solution: per-column loop for the same 3-sigma mask
In [50]:
data_means = np.mean(cleaned_data[:, :-1], axis=0)  # exclude the last (quality) column
data_stds = np.std(cleaned_data[:, :-1], axis=0)
threshold = 3.0
outlier_mask = np.zeros_like(cleaned_data[:, :-1], dtype=bool)
for col_idx in range(cleaned_data.shape[1] - 1):
    col_data = cleaned_data[:, col_idx]
    mean_val = data_means[col_idx]
    std_val = data_stds[col_idx]
    lower_bound = mean_val - threshold * std_val
    upper_bound = mean_val + threshold * std_val
    outlier_mask[:, col_idx] = (col_data < lower_bound) | (col_data > upper_bound)
total_outliers = np.sum(outlier_mask)
print("Total outliers detected:", total_outliers)
outlier_mask
Total outliers detected: 182
Out[50]:
array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]],
      shape=(1592, 11))
Activity 5: Fine-Tuning the Blend: Handling Outliers¶
In [113]:
data_copy = cleaned_data[:, :-1].copy()
data_copy[outlier_mask] = np.nan   # mark the outlier cells so they can be re-imputed
In [114]:
np.sum(np.isnan(data_copy))
Out[114]:
np.int64(182)
In [115]:
# Re-impute: replace the NaN-marked outliers with fresh (outlier-free) column means
data_mean = np.nanmean(data_copy, axis=0)
inds = np.where(np.isnan(data_copy))
data_copy[inds] = np.take(data_mean, inds[1])
# Reattach the untouched quality column
final_data = np.column_stack((data_copy, cleaned_data[:, -1]))
final_data.shape
Out[115]:
(1592, 12)
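Two quick checks before moving on (not part of the activity): no NaNs should remain, and every cell flagged by outlier_mask should now hold its column's outlier-free mean.
In [ ]:
# Verify the re-imputation (data_mean here is the outlier-free column mean)
print(np.isnan(final_data).sum())
print(np.allclose(final_data[:, :-1][outlier_mask],
                  np.take(data_mean, np.where(outlier_mask)[1])))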
Activity 6: Harmonizing the Blend: Feature Scaling¶
In [128]:
final_data_copy = final_data.copy()
In [129]:
max_col = np.nanmax(final_data_copy[:, :-1], axis=0)
min_col = np.nanmin(final_data_copy[:, :-1], axis=0)
max_col
Out[129]:
array([ 13.5   ,   1.04  ,   0.79  ,   6.7   ,   0.226 ,  47.    ,
       144.    ,   1.0022,   3.75  ,   1.16  ,  13.6   ])
In [132]:
# Min-max scaling: (value - min) / (max - min), applied column-wise via broadcasting
scaled_data = (final_data_copy[:, :-1] - min_col) / (max_col - min_col)
scaled_data = np.column_stack((scaled_data, final_data_copy[:, -1]))  # quality stays unscaled
scaled_data[:5]
Out[132]:
array([[0.31460674, 0.63043478, 0.        , 0.17241379, 0.29906542, 0.2173913 , 0.20289855, 0.6       , 0.73033708, 0.27710843, 0.19230769, 5.        ],
       [0.35955056, 0.82608696, 0.        , 0.29310345, 0.40186916, 0.52173913, 0.44202899, 0.50909091, 0.38202247, 0.42168675, 0.26923077, 5.        ],
       [0.35955056, 0.69565217, 0.05063291, 0.24137931, 0.37383178, 0.30434783, 0.34782609, 0.52727273, 0.4494382 , 0.38554217, 0.26923077, 5.        ],
       [0.74157303, 0.17391304, 0.70886076, 0.17241379, 0.29439252, 0.34782609, 0.39130435, 0.61818182, 0.33707865, 0.30120482, 0.26923077, 6.        ],
       [0.31460674, 0.63043478, 0.        , 0.17241379, 0.29906542, 0.2173913 , 0.20289855, 0.6       , 0.73033708, 0.27710843, 0.19230769, 5.        ]])
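A quick range check (not required by the activity): after min-max scaling, every feature column should span exactly [0, 1], while quality keeps its original scale.
In [ ]:
# Per-column minima and maxima of the scaled features
print(scaled_data[:, :-1].min(axis=0))
print(scaled_data[:, :-1].max(axis=0))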
In [131]:
# Alternative solution: scale each feature column in an explicit loop
scaled_data = final_data.copy()
for col_idx in range(scaled_data.shape[1] - 1):
    col_vals = scaled_data[:, col_idx]
    col_min = np.min(col_vals)
    col_max = np.max(col_vals)
    scaled_col = (col_vals - col_min) / (col_max - col_min)
    scaled_data[:, col_idx] = scaled_col
print("First 5 rows of scaled data:\n", scaled_data[:5])
First 5 rows of scaled data:
 [[0.31460674 0.63043478 0.         0.17241379 0.29906542 0.2173913  0.20289855 0.6        0.73033708 0.27710843 0.19230769 5.        ]
 [0.35955056 0.82608696 0.         0.29310345 0.40186916 0.52173913 0.44202899 0.50909091 0.38202247 0.42168675 0.26923077 5.        ]
 [0.35955056 0.69565217 0.05063291 0.24137931 0.37383178 0.30434783 0.34782609 0.52727273 0.4494382  0.38554217 0.26923077 5.        ]
 [0.74157303 0.17391304 0.70886076 0.17241379 0.29439252 0.34782609 0.39130435 0.61818182 0.33707865 0.30120482 0.26923077 6.        ]
 [0.31460674 0.63043478 0.         0.17241379 0.29906542 0.2173913  0.20289855 0.6        0.73033708 0.27710843 0.19230769 5.        ]]
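The loop and the broadcasted formula are interchangeable; a sketch confirming they produce identical features:
In [ ]:
# Recompute the broadcasted version and compare against the loop's result
vectorized_scaled = (final_data[:, :-1] - min_col) / (max_col - min_col)
print(np.allclose(vectorized_scaled, scaled_data[:, :-1]))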
ML Model Training (Optional)¶
In [134]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
In [135]:
# 1. Split features and target
X = scaled_data[:, :-1]
y = scaled_data[:, -1].astype(int)
In [136]:
# 2. Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [137]:
# 3. Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
Out[137]:
RandomForestClassifier(random_state=42)
In [138]:
# 4. Evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy:", accuracy)
Model accuracy: 0.6896551724137931
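A single train/test split can be optimistic or pessimistic by luck of the draw; as an optional extra (a sketch, not part of the project spec), 5-fold cross-validation gives a steadier accuracy estimate.
In [ ]:
from sklearn.model_selection import cross_val_score

# Accuracy averaged over 5 folds instead of one fixed split
cv_scores = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=42),
                            X, y, cv=5)
print(cv_scores.mean())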