===============================================================================
MACHINE LEARNING PRACTICALS - JOURNAL
===============================================================================

===============================================================================
PRACTICAL 1: FEATURE ENGINEERING AND DATA PREPROCESSING
(Handling missing values, Encoding categorical variables, Scaling features)
===============================================================================

------- 1.1: ENCODING (encoding.py) -------

CODE:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

df = pd.read_csv("customer.csv")
print(df)

# Keep review, education and Purchase; drop the age and Gender columns
df1 = df.iloc[:, 2:]
print(df1)

x_train, x_test, y_train, y_test = train_test_split(
    df1.iloc[:, 0:2], df1.iloc[:, -1], test_size=0.1)
print("XTrain: \n", x_train)
print("Ytrain: \n", y_train)
print("XTEST: \n", x_test)
print("YTEST: \n", y_test)

# Ordinal encoding: categories are listed from lowest to highest rank
oe = OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['HSC', 'UG', 'PG']])
oe.fit(x_train)
x_train = oe.transform(x_train)
x_test = oe.transform(x_test)
print(x_train)

# Label encoding for the target column
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
print(y_train)

# One-hot encoding of Gender using sklearn's OneHotEncoder
df2 = df.iloc[:, 1:2]
encod = OneHotEncoder(sparse_output=False)
encoded = encod.fit_transform(df2)
print("Feature Names:")
print(encod.get_feature_names_out())
print(encoded)

OUTPUT:

    age  Gender   review education Purchase
0   NaN    Male     Good       HSC      yes
1  48.0    Male     Good        PG       no
2  68.0  Female  Average        UG       no
3  77.0  Female  Average        PG      yes
4  26.0    Male     Poor        PG      yes
...
[14 rows x 5 columns]

    review education Purchase
0     Good       HSC      yes
1     Good        PG       no
2  Average        UG       no
3  Average        PG      yes
...
[14 rows x 3 columns]

XTrain:
     review education
3   Average        PG
8      Good        UG
6      Good        PG
2   Average        UG
...
[12 rows x 2 columns]

Ytrain:
3     yes
8     yes
6     yes
2      no
...
Name: Purchase, dtype: object

XTEST:
    review education
5     Good        UG
13    Good        UG

YTEST:
5      no
13    yes
Name: Purchase, dtype: object

[[1. 2.]
 [2. 1.]
 [2. 2.]
 [1. 1.]
...
 [0. 2.]]

[1 1 1 0 0 0 0 1 1 1 1 1]

Feature Names:
['Gender_Female' 'Gender_Male']

[[0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
...
 [1. 0.]]
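NOTE: The one-hot step above uses sklearn's OneHotEncoder. The same encoding can
also be produced directly in pandas; a minimal sketch, assuming the same
customer.csv with its Gender column:

CODE (sketch):

import pandas as pd

df = pd.read_csv("customer.csv")
# pd.get_dummies is the pandas counterpart of OneHotEncoder
dummies = pd.get_dummies(df['Gender'], prefix='Gender', dtype=float)
print(dummies.head())   # columns: Gender_Female, Gender_Male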
------- 1.2: BOXPLOT AND HISTOGRAM (boxplot.py) -------

CODE:

import matplotlib.pyplot as plt
import numpy as np

arr = np.array([100, 120, 110, 150, 110, 140, 130, 170, 120, 220, 140, 110])
arr1 = np.sort(arr)
print(arr1)

mean = np.mean(arr)
print("MEAN=", mean)
median = np.median(arr)
print("MEDIAN=", median)
q1 = np.percentile(arr, 25)
print("Quartile 1=", q1)
q3 = np.percentile(arr1, 75)
print("Quartile 3=", q3)

plt.boxplot(arr)
plt.show()
plt.hist(arr)
plt.show()

OUTPUT:

[100 110 110 110 120 120 130 140 140 150 170 220]
MEAN= 135.0
MEDIAN= 125.0
Quartile 1= 110.0
Quartile 3= 142.5
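NOTE: The boxplot flags 220 as an outlier. That follows from the 1.5*IQR fence
rule, which matplotlib's boxplot applies by default (whis=1.5); a sketch that
computes the fences explicitly on the same array:

CODE (sketch):

import numpy as np

arr = np.array([100, 120, 110, 150, 110, 140, 130, 170, 120, 220, 140, 110])
q1, q3 = np.percentile(arr, [25, 75])
iqr = q3 - q1                 # 142.5 - 110.0 = 32.5
lower = q1 - 1.5 * iqr        # 61.25
upper = q3 + 1.5 * iqr        # 191.25
print(arr[(arr < lower) | (arr > upper)])   # [220] -> the only outlier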
------- 1.3: CORRELATION WITH TARGET (corela_target.py) -------

CODE:

import pandas as pd

data = {
    'sqft': [1500, 1600, 1700, 1800, 1900],
    'rooms': [3, 3, 4, 4, 5],
    'roof_color': [1, 2, 1, 2, 1],
    'price': [300000, 320000, 340000, 360000, 380000]
}
df = pd.DataFrame(data)

correlation_matrix = df.corr(numeric_only=True)
print("🔁 Full Correlation Matrix:")
print(correlation_matrix.round(2))

# Correlation of every feature with the target, target itself excluded
correlation = df.corr()['price'].drop('price')
print(correlation)

# Keep features whose absolute correlation with price exceeds 0.3
selected_features = correlation[correlation.abs() > 0.3].index
print("Selected features:", list(selected_features))

OUTPUT:

🔁 Full Correlation Matrix:
            sqft  rooms  roof_color  price
sqft        1.00   0.94        0.00   1.00
rooms       0.94   1.00       -0.33   0.94
roof_color  0.00  -0.33        1.00   0.00
price       1.00   0.94        0.00   1.00

sqft          1.000000e+00
rooms         9.449112e-01
roof_color    5.250970e-17
Name: price, dtype: float64
Selected features: ['sqft', 'rooms']

------- 1.4: COLUMN TRANSFORMER ENCODING (column_trans_encod.py) -------

CODE:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

df = pd.read_csv("customer.csv")
print(df)

x = df.iloc[:, :4]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

trans = ColumnTransformer(
    transformers=[
        ('impute_age', SimpleImputer(), ['age']),
        ('onehot_gender', OneHotEncoder(sparse_output=False), ['Gender']),
        ('ordinal_rating', OrdinalEncoder(categories=[['Poor', 'Average', 'Good']]), ['review']),
        ('ordinal_education', OrdinalEncoder(categories=[['HSC', 'UG', 'PG']]), ['education'])
    ],
    remainder='passthrough'
)

x_train = trans.fit_transform(x_train)
# Use transform (not fit_transform) on the test set, so it is encoded with
# the statistics and categories learned from the training set
x_test = trans.transform(x_test)
print("\nTransformed XTrain:\n", x_train)
print("\nTransformed XTest:\n", x_test)

le = LabelEncoder()
y_train1 = le.fit_transform(y_train)
y_test1 = le.transform(y_test)
print("\nTransformed YTrain:\n", y_train1)
print("\nTransformed YTest:\n", y_test1)

OUTPUT:

    age  Gender   review education Purchase
0   NaN    Male     Good       HSC      yes
1  48.0    Male     Good        PG       no
2  68.0  Female  Average        UG       no
3  77.0  Female  Average        PG      yes
4  26.0    Male     Poor        PG      yes
...
[14 rows x 5 columns]

Transformed XTrain:
[[55.  0.  1.  0.  2.]
 [18.  0.  1.  2.  1.]
 [44.  0.  1.  2.  1.]
 [50.  1.  0.  2.  1.]
...
 [26.  0.  1.  0.  2.]]

Transformed XTest:
[[77.  1.  2.  0.]
 [77.  1.  1.  2.]]

Transformed YTrain:
[1 0 1 1 1 0 1 0 0 0 1 1]

Transformed YTest:
[0 0]

------- 1.5: CORRELATION BETWEEN FEATURES (corel_bt_feat.py) -------

CODE:

import pandas as pd

data = {
    'sqft': [1500, 1600, 1700, 1800, 1900],
    'rooms': [3, 3, 4, 4, 5],
    'bathrooms': [1, 2, 2, 2, 3],
    'roof_color': [1, 2, 1, 2, 1],
    'price': [300000, 320000, 340000, 360000, 380000]
}
df = pd.DataFrame(data)

# Pairwise correlation among the features only (target dropped)
feature_corr = df.drop(columns='price').corr()
print("Correlation between features:")
print(feature_corr.round(2))

OUTPUT:

Correlation between features:
            sqft  rooms  bathrooms  roof_color
sqft        1.00   0.94       0.89        0.00
rooms       0.94   1.00       0.85       -0.33
bathrooms   0.89   0.85       1.00        0.00
roof_color  0.00  -0.33       0.00        1.00

===============================================================================
PRACTICAL 2: PRINCIPAL COMPONENT ANALYSIS (PCA)
(Dimensionality Reduction while retaining maximum variance)
===============================================================================

CODE:

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("student_dataset.csv")
print(df)

# Standardize the three feature columns (zero mean, unit variance)
scaler = StandardScaler()
df1 = scaler.fit_transform(df.iloc[:, :3])
print(df1)

cov_matrix = np.cov(df1.T)
print("COVARIANCE MATRIX:\n", cov_matrix)

eig_val, eig_vect = np.linalg.eig(cov_matrix)
print("\nEigen Values\n", eig_val)
print("Eigen Vectors\n", eig_vect)

# Columns 0 and 2 correspond to the two largest eigenvalues
# (3.1837 and 0.0218), so they form the top two principal components
pc = eig_vect[:, [0, 2]]
pc = pc.T
print("\nTop 2 Principal Components:\n", pc)

# Project the standardized data onto the two components
trans_df = np.dot(df1[:, 0:3], pc.T)
print("\nNew Transform\n", trans_df)

Dataf = pd.DataFrame(trans_df, columns=['PC1', 'PC2'])
Dataf['GTU Marks'] = df['GTU'].values
print(Dataf)

OUTPUT:

    Mid_Sem   IQ  HSC  GTU
0        35  110   78   70
1        42  125   85   88
2        28  100   72   65
3        45  130   90   92
4        38  115   80   78
...
[15 rows x 4 columns]

[[-0.09736702 -0.20785572 -0.20441405]
 [ 1.03858157  1.20934235  0.81765621]
 [-1.23331562 -1.15265443 -1.08047428]
 [ 1.52541669  1.68174171  1.5477064 ]
...
 [-0.74648051 -0.96369469 -0.93446424]]

COVARIANCE MATRIX:
 [[1.07142857 1.0614152  1.05676449]
 [1.0614152  1.07142857 1.05019437]
 [1.05676449 1.05019437 1.07142857]]

Eigen Values
 [3.18368463 0.00878971 0.02181137]
Eigen Vectors
 [[-0.57842869 -0.7974863  -0.17156877]
 [-0.57723546  0.54876897 -0.60469152]
 [-0.57638483  0.25073535  0.77776109]]

Top 2 Principal Components:
 [[-0.57842869 -0.57723546 -0.57638483]
 [-0.17156877 -0.60469152  0.77776109]]

New Transform
 [[ 0.29412273 -0.01659157]
 [-1.77010531 -0.27352604]
 [ 2.00150714  0.06824795]
 [-2.74518022 -0.074903  ]
...
 [ 1.5266755  -0.01597918]]

        PC1       PC2  GTU Marks
0  0.294123 -0.016592         70
1 -1.770105 -0.273526         88
2  2.001507  0.068248         65
3 -2.745180 -0.074903         92
4 -0.428478 -0.158651         78
...
[15 rows x 3 columns]
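NOTE: The manual eigendecomposition can be cross-checked with sklearn's PCA
class; a minimal sketch, assuming the same student_dataset.csv and the same
standardization (individual component signs may be flipped relative to the
manual result):

CODE (sketch):

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("student_dataset.csv")
X = StandardScaler().fit_transform(df.iloc[:, :3])

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(X_pca[:5])
print("Explained variance ratio:", pca.explained_variance_ratio_)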
===============================================================================
PRACTICAL 3: DECISION TREE CLASSIFIER
(Classification with evaluation using precision, recall, and F1-score)
===============================================================================

CODE:

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

data = pd.read_csv("decesiontree.csv")
print(data)

# Map the categorical values to integer codes
cleanup_nums = {"Age": {"Youth": 0, "Middle": 1, "Senior": 2},
                "Income": {"Low": 0, "Medium": 1, "High": 2},
                "Student": {"No": 0, "Yes": 1},
                "Credit Rating": {"Fair": 1, "Excellent": 2},
                "Buys-Computer": {"No": 0, "Yes": 1}}
data.replace(cleanup_nums, inplace=True)
print(data)

predictors = data.iloc[:, 1:5]
target = data.iloc[:, 5]

dtree_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100,
                                       max_depth=3, min_samples_leaf=5)

OUTPUT:

    Item no     Age  Income Student Credit Rating Buys-Computer
0         1   Youth    High      No          Fair            No
1         2   Youth    High      No     Excellent            No
2         3  Middle    High      No          Fair           Yes
3         4  Senior  Medium      No          Fair           Yes
4         5  Senior     Low     Yes          Fair           Yes
...
[14 rows x 6 columns]

    Item no  Age  Income  Student  Credit Rating  Buys-Computer
0         1    0       2        0              1              0
1         2    0       2        0              2              0
2         3    1       2        0              1              1
3         4    2       1        0              1              1
4         5    2       0        1              1              1
...
[14 rows x 6 columns]
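NOTE: The listing stops before the tree is actually trained, even though the
evaluation imports are already in place. A minimal sketch of the missing
fit-and-evaluate step, continuing from the variables above (the 80/20 split and
random_state are illustrative, not from the recorded run):

CODE (sketch):

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.2, random_state=100)

dtree_entropy.fit(X_train, y_train)
y_pred = dtree_entropy.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))   # precision, recall, F1 per class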
===============================================================================
PRACTICAL 4: NAIVE BAYES CLASSIFIER
(Probabilistic classification using Gaussian Naive Bayes)
===============================================================================

CODE:

import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

fl = "Naive_Bayesian.csv"
df = pd.read_csv(fl, index_col="Item no")
print(df)

dfCol = df.columns
print("df columns: ", dfCol)
ndfCol = df.shape[1]
ndfRow = df.shape[0]

# Collect each column as a plain Python list
feature = [[] for _ in range(ndfCol)]
for i in range(ndfCol):
    feature[i] = list(df[dfCol[i]])
    print(dfCol[i], ":", feature[i])

# Label-encode every column
le = preprocessing.LabelEncoder()
feature0 = [[] for _ in range(ndfCol)]
for i in range(ndfCol):
    feature0[i] = le.fit_transform(feature[i])
    print(dfCol[i], "encoded:", feature0[i])

# Rebuild the rows: one tuple of encoded feature values per sample
features = []
for i in range(ndfRow):
    xlst = []
    for j in range(ndfCol - 1):
        xlst.append(feature0[j][i])
    features.append(tuple(xlst))
print("features:", features)

# Last encoded column is the class label (shifted to 1/2 for readability)
label = feature0[ndfCol - 1]
label = [label[i] + 1 for i in range(ndfRow)]
print("label:", label)

model = GaussianNB()
model.fit(features, label)
print("model:", model)

ptStr = input("Enter unknown data (separated by ,) excluding Index Column: ")
ptLst = [int(x) for x in ptStr.split(',')]
point1 = [ptLst]
print("Unknown data (sample):", point1)
predicted = model.predict(point1)
print("Class for Point:", point1, "is:", predicted)

OUTPUT (with input: 0,1,1,0):

            Age  Income Student Credit Rating Buys-Computer
Item no
1         Youth    High      No          Fair            No
2         Youth    High      No     Excellent            No
3        Middle    High      No          Fair           Yes
4        Senior  Medium      No          Fair           Yes
...
[14 rows x 5 columns]

df columns:  Index(['Age', 'Income', 'Student', 'Credit Rating', 'Buys-Computer'], dtype='object')
Age : ['Youth', 'Youth', 'Middle', 'Senior', 'Senior', ...]
Income : ['High', 'High', 'High', 'Medium', 'Low', ...]
Student : ['No', 'No', 'No', 'No', 'Yes', ...]
Credit Rating : ['Fair', 'Excellent', 'Fair', 'Fair', 'Fair', ...]
Buys-Computer : ['No', 'No', 'Yes', 'Yes', 'Yes', ...]
Age encoded: [2 2 0 1 1 0 1 2 2 1 2 0 0 1]
Income encoded: [0 0 0 2 1 1 1 2 1 2 2 2 0 2]
Student encoded: [0 0 0 0 1 1 1 0 1 1 1 0 1 1]
Credit Rating encoded: [1 0 1 1 1 0 0 1 1 1 0 0 1 0]
Buys-Computer encoded: [0 0 1 1 1 0 1 0 1 1 1 1 1 0]
features: [(2, 0, 0, 1), (2, 0, 0, 0), (0, 0, 0, 1), (1, 2, 0, 1), (1, 1, 1, 1), (0, 1, 1, 0), ...]
label: [1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1]
model: GaussianNB()
Enter unknown data (separated by ,) excluding Index Column:
Unknown data (sample): [[0, 1, 1, 0]]
Class for Point: [[0, 1, 1, 0]] is: [2]

===============================================================================
PRACTICAL 5: LINEAR REGRESSION
(Predicting continuous values with evaluation using MAE, MSE and RMSE)
===============================================================================

CODE:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

dataset = pd.read_csv("LinearRegression.csv")
print(dataset)

x = dataset.iloc[:, 0:1]
y = dataset.iloc[:, 1]
# Map the Yes/No labels to 1/0 so they can be regressed against
y = y.replace(['Yes', 'No'], [1, 0])
print(y)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.01, random_state=123)

model = LinearRegression()
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Predict for a temperature of 18 degrees and threshold at 0.5
y_pred_val = model.predict([[18]])
print(y_pred_val)
if y_pred_val[0] > 0.5:
    print("Yes")
else:
    print("No")

plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, model.predict(X_train))

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

OUTPUT:

   Outside Temperature \nCelcius Wear a\n jacket
0                             30              No
1                             25              No
2                             20              No
3                             15             Yes
4                             10             Yes

0    0
1    0
2    0
3    1
4    1
Name: Wear a\n jacket, dtype: int64
[0.54285714]
Yes
Mean Absolute Error: 0.14285714285714302
Mean Squared Error: 0.02040816326530617
Root Mean Squared Error: 0.14285714285714302

===============================================================================
PRACTICAL 6: K-NEAREST NEIGHBORS (KNN) CLASSIFIER
(Classification using different k values with accuracy evaluation)
===============================================================================

CODE:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("knn.csv")
# Keep only rows where 'Item no.' is present (drops blank rows from the CSV)
df = df[df['Item no.'].notna()]
print("Dataset Preview:")
print(df.head())

X = df.iloc[:, 1:4]
y = df.iloc[:, 4]
print("INPUT\n", X)
print("OUTPUT\n", y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features; fit the scaler on training data only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("XTRAIN\n", X_train)
print("X_TEST\n", X_test)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("PREDICTION : \n", y_pred)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

OUTPUT:

Dataset Preview:
   Item no.  Temp  Humidity  Wind Speed Play ...
0       1.0  85.0      85.0        12.0   No ...
1       2.0  80.0      90.0         9.0   No ...
2       3.0  83.0      86.0         4.0  Yes ...
3       4.0  70.0      96.0         3.0  Yes ...
4       5.0  68.0      80.0         5.0  Yes ...

INPUT
     Temp  Humidity  Wind Speed
0   85.0      85.0        12.0
1   80.0      90.0         9.0
2   83.0      86.0         4.0
3   70.0      96.0         3.0
4   68.0      80.0         5.0
...
[14 rows x 3 columns]
OUTPUT
0      No
1      No
2     Yes
3     Yes
4     Yes
...
Name: Play, dtype: object
XTRAIN
 [[ 1.37690922 -0.53048047 -0.46006855]
 [-1.22885447 -0.99359834  2.25104967]
 [-0.57741354 -0.99359834 -0.46006855]
 [ 1.70262968  0.48837885 -0.64080976]
...
 [-1.3917147  -1.45671621 -1.00229219]]
X_TEST
 [[ 0.39974784 -0.0673626  -1.00229219]
 [-0.08883285  0.85887314 -0.64080976]
 [ 2.02835014  0.39575527  0.80511996]]
PREDICTION :
 ['Yes' 'Yes' 'Yes']

Accuracy: 0.6666666666666666

Classification Report:
              precision    recall  f1-score   support

          No       0.00      0.00      0.00         1
         Yes       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3
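NOTE: The title mentions different k values, but the listing evaluates only
k=3. A small sketch of the usual loop over candidate k, continuing from the
scaled split above (the candidate range is illustrative):

CODE (sketch):

for k in range(1, 8, 2):   # k = 1, 3, 5, 7
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    acc = accuracy_score(y_test, knn_k.predict(X_test))
    print(f"k={k}: accuracy={acc:.2f}")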
===============================================================================
PRACTICAL 7: MULTIPLE LINEAR REGRESSION
(Prediction using multiple features with R² score and RMSE evaluation)
===============================================================================

CODE:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

Data = pd.read_excel("student_data1.xlsx")
print(Data)

X = Data.iloc[:, :2]
y = Data.iloc[:, -1:]
print(X)
print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Xtrain\n", X_train)
print("YTEST\n", y_test)

model = LinearRegression()
# to_numpy() drops the column names, so predict() accepts a plain list
model.fit(X_train.to_numpy(), y_train)
y_pred = model.predict([[8.6, 125]])
print("model prediction for CGPA=8.6, IQ=125:\n", y_pred.round(2))
print("M= ", model.coef_.round(2))
print("b= ", model.intercept_.round(2))

OUTPUT:

   CGPA   IQ  Placement (LPA)
0   7.5  110              6.5
1   8.0  120              7.0
2   8.5  125              8.2
3   9.0  130              9.1
4   6.5  100              5.0
...
[10 rows x 3 columns]

   CGPA   IQ
0   7.5  110
1   8.0  120
2   8.5  125
3   9.0  130
4   6.5  100
...
[10 rows x 2 columns]

   Placement (LPA)
0              6.5
1              7.0
2              8.2
3              9.1
4              5.0
...
[10 rows x 1 columns]

Xtrain
    CGPA   IQ
5   7.0  105
0   7.5  110
7   8.8  128
2   8.5  125
...
[8 rows x 2 columns]

YTEST
    Placement (LPA)
8              5.2
1              7.0

model prediction for CGPA=8.6, IQ=125:
 [[8.45]]
M=  [[1.32 0.03]]
b=  [-6.51]
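NOTE: The header promises R² score and RMSE, and r2_score is imported but never
used in the listing. A minimal evaluation sketch on the held-out split,
continuing from the fitted model above:

CODE (sketch):

from sklearn.metrics import mean_squared_error

y_test_pred = model.predict(X_test.to_numpy())
print(f"R2 score: {r2_score(y_test, y_test_pred):.3f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred)):.3f}")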
===============================================================================
PRACTICAL 8: SINGULAR VALUE DECOMPOSITION (SVD)
(Dimensionality Reduction using SVD - Manual & Sklearn Implementation)
===============================================================================

------- 8.1: SVD MANUAL IMPLEMENTATION (svd.py) -------

CODE:

import pandas as pd
import numpy as np

df = pd.read_excel("student_dataset.xlsx")
A = df.iloc[:, :3].to_numpy()

# Center the data (mean of each column becomes zero)
A_mean = A - np.mean(A, axis=0)

# Full SVD: U (left singular vectors), X (singular values), V_T
U, X, V_T = np.linalg.svd(A_mean)

# Keep the top k components
k = 2
U_k = U[:, :k]
S_k = np.diag(X[:k])
final_data1 = np.dot(U_k, S_k)
print("Reduced Data:\n", final_data1)

# Squared singular values are proportional to the variance captured
explained_variance = (X[:k]**2) / np.sum(X**2)
print("Explained variance by top 2 components:", explained_variance)

reduced_df = pd.DataFrame(final_data1, columns=["PC1", "PC2"])
reduced_df['GTU'] = df['GTU'].values
print(reduced_df)

OUTPUT:

Reduced Data:
 [[ -2.60622042   0.08983428]
 [ 15.20533711  -2.22651162]
 [-16.15266994   0.28962606]
 [ 22.72992624  -0.77843972]
 [  3.46193108  -0.87192496]
...
 [-12.83777493   0.27648847]]
Explained variance by top 2 components: [0.99132896 0.00672569]

          PC1       PC2  GTU
0   -2.606220  0.089834   70
1   15.205337 -2.226512   88
2  -16.152670  0.289626   65
3   22.729926 -0.778440   92
4    3.461931 -0.871925   78
...
[15 rows x 3 columns]

------- 8.2: SVD USING SKLEARN (svd2.py) -------

CODE:

import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

df = pd.read_excel("student_dataset.xlsx")
X = df.iloc[:, :3]

# Standardize before TruncatedSVD so it behaves like PCA on scaled data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_scaled)
print(X_reduced)

Dataf = pd.DataFrame(X_reduced, columns=['PC1', 'PC2'])
Dataf['GTU Marks'] = df['GTU'].values
print(Dataf)

print("Singular values:", svd.singular_values_)
print("Explained variance:", svd.explained_variance_)
print("Explained variance ratio:", svd.explained_variance_ratio_)
print("Total variance captured:", svd.explained_variance_ratio_.sum())

OUTPUT:

[[-0.29412273 -0.01659157]
 [ 1.77010531 -0.27352604]
 [-2.00150714  0.06824795]
 [ 2.74518022 -0.074903  ]
 [ 0.42847827 -0.1586513 ]
...
 [-1.5266755  -0.01597918]]

        PC1       PC2  GTU Marks
0 -0.294123 -0.016592         70
1  1.770105 -0.273526         88
2 -2.001507  0.068248         65
3  2.745180 -0.074903         92
4  0.428478 -0.158651         78
...
[15 rows x 3 columns]

Singular values: [6.67619539 0.55259316]
Explained variance: [2.97143899 0.02035728]
Explained variance ratio: [0.99047966 0.00678576]
Total variance captured: 0.997265423131314

===============================================================================
END OF JOURNAL
===============================================================================