Statement of Completion #18a0ceb1
Numeric programming with NumPy
medium
From Grape to Glass: A NumPy Journey in Wine Data Cleaning
Resolution
Import Libraries
In [1]:
import numpy as np
print("Welcome to VinoVeritas Data Cleaning Project!")
Welcome to VinoVeritas Data Cleaning Project!
Activities
Activity 1: Uncorking the Data: Loading from CSV
In [5]:
# Load the dataset; the file is semicolon-delimited with one header row to skip
wine_data = np.genfromtxt("winequality-red.csv", delimiter=';', skip_header=1)
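A quick sanity check right after loading catches delimiter or header mistakes early. A minimal sketch, assuming the standard UCI red-wine file with 12 semicolon-separated columns:

print("Loaded shape:", wine_data.shape)  # expect (1599, 12) for this file
print("First row:", wine_data[0])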
Activity 2: Searching for Ghosts: Locating the NaNs
In [13]:
# Boolean mask marking every NaN cell, then aggregate counts from it
nan_mask = np.isnan(wine_data)
total_nans = np.sum(nan_mask)
num_rows_with_nan = np.sum(np.any(nan_mask, axis=1))
print("The total number of NaNs in the data:", total_nans)
print("Number of rows with at least one NaN:", num_rows_with_nan)
The total number of NaNs in the data: 191
Number of rows with at least one NaN: 184
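Beyond the totals, a per-column tally shows which measurements are missing most often; a small sketch reusing the same mask:

nans_per_col = np.sum(nan_mask, axis=0)  # one NaN count per column
print("NaNs per column:", nans_per_col)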
Activity 3: Patch Up the Barrels: Imputing Missing Data
In [19]:
# Drop rows with two or more NaNs, then impute the remaining NaNs with column means
nans_per_row = np.sum(nan_mask, axis=1)
keep_rows_mask = nans_per_row < 2
filtered_data = wine_data[keep_rows_mask]
filtered_data_mask = nan_mask[keep_rows_mask]
col_means = np.nanmean(filtered_data, axis=0)    # per-column means, ignoring NaNs
cleaned_data = np.copy(filtered_data)
inds = np.where(np.isnan(cleaned_data))          # (row, col) indices of remaining NaNs
cleaned_data[inds] = np.take(col_means, inds[1])
print("Original data shape:", wine_data.shape)
print("Cleaned data shape:", cleaned_data.shape)
Original data shape: (1599, 12)
Cleaned data shape: (1592, 12)
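The np.take indexing trick can also be written as a single broadcast: np.where picks the column mean wherever a value is NaN. A minimal sketch equivalent to the cell above:

# Broadcast col_means (shape (12,)) against filtered_data (shape (n, 12))
cleaned_alt = np.where(np.isnan(filtered_data), col_means, filtered_data)
assert np.allclose(cleaned_alt, cleaned_data)  # same result as the np.take version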
Activity 4: Hunting for Outliers: Spotting Extreme Values
In [26]:
# Flag values more than three standard deviations from their column mean
chemical_data = cleaned_data[:, :-1]             # drop the quality column
col_mean = np.mean(chemical_data, axis=0)
col_std = np.std(chemical_data, axis=0)
lower_bound = col_mean - 3 * col_std
upper_bound = col_mean + 3 * col_std
outlier_mask = (chemical_data < lower_bound) | (chemical_data > upper_bound)
total_outliers = np.sum(outlier_mask)
total_outliers
Out[26]:
np.int64(182)
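Since lower_bound and upper_bound each hold one value per chemical column, the comparison broadcasts row-wise. Summing the mask per column shows which variables drive the 182 outliers; a minimal sketch:

outliers_per_col = np.sum(outlier_mask, axis=0)  # outlier count for each of the 11 features
print("Outliers per column:", outliers_per_col)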
Activity 5: Fine-Tuning the Blend: Handling Outliers
In [52]:
# 1. Copy the cleaned_data
data_with_outliers_masked = cleaned_data.copy()
# 2. Replace outliers with NaN
for col_idx in range(data_with_outliers_masked.shape[1] - 1):
    data_with_outliers_masked[outlier_mask[:, col_idx], col_idx] = np.nan
# 3. Impute these new NaNs with the column mean
col_means_post_outlier = np.nanmean(data_with_outliers_masked, axis=0)
def fill_nan_with_col_mean(data, col_means):
    """Return a copy of data with each NaN replaced by its column's mean."""
    data_copy = data.copy()
    mask = np.isnan(data_copy)
    for c in range(data_copy.shape[1]):
        data_copy[mask[:, c], c] = col_means[c]
    return data_copy
final_data = fill_nan_with_col_mean(data_with_outliers_masked, col_means_post_outlier)
print("Shape of final_data:", final_data.shape)
print("Any NaNs remaining?", np.isnan(final_data).any())
Shape of final_data: (1592, 12)
Any NaNs remaining? False
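As a follow-up check, re-applying the Activity 4 bounds to the imputed features should confirm the extreme values are gone, since each outlier was replaced by a column mean that sits well inside mean ± 3·std. A sketch reusing the earlier bounds:

chem = final_data[:, :-1]
remaining = np.sum((chem < lower_bound) | (chem > upper_bound))
print("Outliers remaining after imputation:", remaining)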
Activity 6: Harmonizing the Blend: Feature Scaling
In [56]:
# Min-max scale every feature column to [0, 1]; leave the quality column untouched
scaled_data = final_data.copy()
for col_idx in range(scaled_data.shape[1] - 1):
    col_vals = scaled_data[:, col_idx]
    col_min = np.min(col_vals)
    col_max = np.max(col_vals)
    scaled_col = (col_vals - col_min) / (col_max - col_min)
    scaled_data[:, col_idx] = scaled_col
print("First 5 rows of scaled data:\n", scaled_data[:5])
First 5 rows of scaled data:
 [[0.31460674 0.63043478 0.         0.17241379 0.29906542 0.2173913
  0.20289855 0.6        0.73033708 0.27710843 0.19230769 5.        ]
 [0.35955056 0.82608696 0.         0.29310345 0.40186916 0.52173913
  0.44202899 0.50909091 0.38202247 0.42168675 0.26923077 5.        ]
 [0.35955056 0.69565217 0.05063291 0.24137931 0.37383178 0.30434783
  0.34782609 0.52727273 0.4494382  0.38554217 0.26923077 5.        ]
 [0.74157303 0.17391304 0.70886076 0.17241379 0.29439252 0.34782609
  0.39130435 0.61818182 0.33707865 0.30120482 0.26923077 6.        ]
 [0.31460674 0.63043478 0.         0.17241379 0.29906542 0.2173913
  0.20289855 0.6        0.73033708 0.27710843 0.19230769 5.        ]]
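Min-max scaling maps each value to (x − min) / (max − min), so every feature lands in [0, 1] while the quality column keeps its original integer scale. The per-column loop can also be collapsed into one vectorized expression; a sketch equivalent to the cell above:

features = final_data[:, :-1]
mins = features.min(axis=0)
maxs = features.max(axis=0)
scaled_alt = (features - mins) / (maxs - mins)   # broadcasts per column
assert np.allclose(scaled_alt, scaled_data[:, :-1])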
ML Model Training (Optional)
In [58]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
In [59]:
# 1. Split features and target
X = scaled_data[:, :-1]
y = scaled_data[:, -1].astype(int)
In [60]:
# 2. Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [61]:
# 3. Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
Out[61]:
RandomForestClassifier(random_state=42)
In [62]:
# 4. Evaluate the model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model accuracy:", accuracy)
Model accuracy: 0.6896551724137931
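Quality labels in this dataset are heavily concentrated around scores 5 and 6 (as the sample rows above suggest), so overall accuracy can mask weak performance on rare classes. A short sketch of a per-class view using sklearn's confusion_matrix:

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))  # rows = true quality, columns = predicted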