===============================================================================
                    MACHINE LEARNING PRACTICALS - JOURNAL
===============================================================================

===============================================================================
PRACTICAL 1: FEATURE ENGINEERING AND DATA PREPROCESSING
(Handling missing values, Encoding categorical variables, Scaling features)
===============================================================================

------- 1.1: ENCODING (encoding.py) -------

CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

df = pd.read_csv("customer.csv")
print(df)

df1 = df.iloc[:, 2:]
print(df1)

x_train, x_test, y_train, y_test = train_test_split(df1.iloc[:, 0:2], df1.iloc[:, -1], test_size=0.1)
print("XTrain: \n", x_train)
print("Ytrain: \n", y_train)
print("XTEST: \n", x_test)
print("YTEST: \n", y_test)

# Ordinal encoding for the ordered categories (review, education)
oe = OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['HSC', 'UG', 'PG']])
oe.fit(x_train)
x_train = oe.transform(x_train)
x_test = oe.transform(x_test)
print(x_train)

# Label encoding for the target column
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
print(y_train)

# One-hot encoding of Gender using sklearn's OneHotEncoder
df2 = df.iloc[:, 1:2]
encod = OneHotEncoder(sparse_output=False)
encoded = encod.fit_transform(df2)
print("Feature Names:")
print(encod.get_feature_names_out())
print(encoded)

OUTPUT:
     age  Gender   review education Purchase
0    NaN    Male     Good       HSC      yes
1   48.0    Male     Good        PG       no
2   68.0  Female  Average        UG       no
3   77.0  Female  Average        PG      yes
4   26.0    Male     Poor        PG      yes
...
[14 rows x 5 columns]

     review education Purchase
0      Good       HSC      yes
1      Good        PG       no
2   Average        UG       no
3   Average        PG      yes
...
[14 rows x 3 columns]

XTrain:
      review education
3   Average        PG
8      Good        UG
6      Good        PG
2   Average        UG
...
[12 rows x 2 columns]

Ytrain:
 3     yes
8     yes
6     yes
2      no
...
Name: Purchase, dtype: object

XTEST:
     review education
5      Good        UG
13     Good        UG

YTEST:
 5      no
13    yes
Name: Purchase, dtype: object

[[1. 2.]
 [2. 1.]
 [2. 2.]
 [1. 1.]
 ...
 [0. 2.]]

[1 1 1 0 0 0 0 1 1 1 1 1]

Feature Names:
['Gender_Female' 'Gender_Male']

[[0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]]

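The one-hot step above uses sklearn's OneHotEncoder. For comparison, a minimal sketch of the equivalent encoding done directly in pandas, assuming the same customer.csv with its Gender column:

import pandas as pd

df = pd.read_csv("customer.csv")
# get_dummies creates one indicator column per category of Gender
dummies = pd.get_dummies(df["Gender"], prefix="Gender", dtype=float)
print(dummies.head())
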
------- 1.2: BOXPLOT AND HISTOGRAM (boxplot.py) -------

CODE:
import matplotlib.pyplot as plt
import numpy as np

arr = np.array([100, 120, 110, 150, 110, 140, 130, 170, 120, 220, 140, 110])
arr1 = np.sort(arr)
print(arr1)
mean = np.mean(arr)
print("MEAN=", mean)
median = np.median(arr)
print("MEDIAN=", median)
q1 = np.percentile(arr, 25)
print("Quarter 1=", q1)
q3 = np.percentile(arr, 75)
print("Quarter 3=", q3)
plt.boxplot(arr)
plt.show()
plt.hist(arr)
plt.show()

OUTPUT:
[100 110 110 110 120 120 130 140 140 150 170 220]
MEAN= 135.0
MEDIAN= 125.0
Quarter 1= 110.0
Quarter 3= 142.5

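The boxplot marks points beyond 1.5 × IQR from the quartiles as outliers; a short sketch, continuing from arr, q1, and q3 above, that applies the same rule by hand:

iqr = q3 - q1                     # 142.5 - 110.0 = 32.5
upper = q3 + 1.5 * iqr            # 191.25
lower = q1 - 1.5 * iqr            # 61.25
print(arr[(arr > upper) | (arr < lower)])   # [220] is flagged as an outlier
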
------- 1.3: CORRELATION WITH TARGET (corela_target.py) -------

CODE:
import pandas as pd

data = {
    'sqft': [1500, 1600, 1700, 1800, 1900],
    'rooms': [3, 3, 4, 4, 5],
    'roof_color': [1, 2, 1, 2, 1],
    'price': [300000, 320000, 340000, 360000, 380000]
}

df = pd.DataFrame(data)
correlation_matrix = df.corr(numeric_only=True)
print("🔁 Full Correlation Matrix:")
print(correlation_matrix.round(2))

correlation = df.corr()['price'].drop('price')
print(correlation)

selected_features = correlation[correlation.abs() > 0.3].index
print("Selected features:", list(selected_features))

OUTPUT:
🔁 Full Correlation Matrix:
            sqft  rooms  roof_color  price
sqft        1.00   0.94        0.00   1.00
rooms       0.94   1.00       -0.33   0.94
roof_color  0.00  -0.33        1.00   0.00
price       1.00   0.94        0.00   1.00

sqft          1.000000e+00
rooms         9.449112e-01
roof_color    5.250970e-17
Name: price, dtype: float64

Selected features: ['sqft', 'rooms']

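The perfect 1.00 correlation between sqft and price is no accident: in this toy data, price is exactly 200 * sqft. A quick check, reusing df from above:

print((df['price'] / df['sqft']).unique())   # [200.], an exact linear relation
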
------- 1.4: COLUMN TRANSFORMER ENCODING (column_trans_encod.py) -------

CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

df = pd.read_csv("customer.csv")
print(df)

x = df.iloc[:, :4]
y = df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

trans = ColumnTransformer(
    transformers=[
        ('impute_age', SimpleImputer(), ['age']),
        ('onehot_gender', OneHotEncoder(sparse_output=False), ['Gender']),
        ('ordinal_rating', OrdinalEncoder(categories=[['Poor', 'Average', 'Good']]), ['review']),
        ('ordinal_education', OrdinalEncoder(categories=[['HSC', 'UG', 'PG']]), ['education'])
    ],
    remainder='passthrough'
)

x_train = trans.fit_transform(x_train)
# use transform (not fit_transform) on the test set so the encoders
# fitted on the training data are reused rather than refitted
x_test = trans.transform(x_test)
print("\nTransformed XTrain:\n", x_train)
print("\nTransformed XTest:\n", x_test)

le = LabelEncoder()
y_train1 = le.fit_transform(y_train)
y_test1 = le.transform(y_test)
print("\nTransformed YTrain:\n", y_train1)
print("\nTransformed YTest:\n", y_test1)

OUTPUT:
     age  Gender   review education Purchase
0    NaN    Male     Good       HSC      yes
1   48.0    Male     Good        PG       no
2   68.0  Female  Average        UG       no
3   77.0  Female  Average        PG      yes
4   26.0    Male     Poor        PG      yes
...
[14 rows x 5 columns]

Transformed XTrain:
 [[55.  0.  1.  0.  2.]
 [18.  0.  1.  2.  1.]
 [44.  0.  1.  2.  1.]
 [50.  1.  0.  2.  1.]
 ...
 [26.  0.  1.  0.  2.]]

Transformed XTest:
 [[77.  1.  2.  0.]
 [77.  1.  1.  2.]]

Transformed YTrain:
 [1 0 1 1 1 0 1 0 0 0 1 1]

Transformed YTest:
 [0 0]

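To see which transformed column corresponds to which transformer, ColumnTransformer exposes get_feature_names_out (sklearn >= 1.0); a minimal sketch reusing the fitted trans from above, with illustrative output:

print(trans.get_feature_names_out())
# e.g. ['impute_age__age' 'onehot_gender__Gender_Female' 'onehot_gender__Gender_Male'
#       'ordinal_rating__review' 'ordinal_education__education']
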
------- 1.5: CORRELATION BETWEEN FEATURES (corel_bt_feat.py) -------

CODE:
import pandas as pd

data = {
    'sqft': [1500, 1600, 1700, 1800, 1900],
    'rooms': [3, 3, 4, 4, 5],
    'bathrooms': [1, 2, 2, 2, 3],
    'roof_color': [1, 2, 1, 2, 1],
    'price': [300000, 320000, 340000, 360000, 380000]
}
df = pd.DataFrame(data)
feature_corr = df.drop(columns='price').corr()
print("Correlation between features:")
print(feature_corr.round(2))

OUTPUT:
Correlation between features:
            sqft  rooms  bathrooms  roof_color
sqft        1.00   0.94       0.89        0.00
rooms       0.94   1.00       0.85       -0.33
bathrooms   0.89   0.85       1.00        0.00
roof_color  0.00  -0.33       0.00        1.00

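With sqft and rooms correlated at 0.94, one of the pair is usually dropped to limit multicollinearity. A minimal sketch of the common upper-triangle filter, reusing feature_corr from above (the 0.9 cutoff is an arbitrary choice):

import numpy as np

upper = feature_corr.where(np.triu(np.ones(feature_corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col].abs() > 0.9).any()]
print(to_drop)   # ['rooms'] under a 0.9 cutoff
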

===============================================================================
PRACTICAL 2: PRINCIPAL COMPONENT ANALYSIS (PCA)
(Dimensionality Reduction while retaining maximum variance)
===============================================================================

CODE:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("student_dataset.csv")
print(df)

scaler = StandardScaler()
df1 = scaler.fit_transform(df.iloc[:, :3])
print(df1)

cov_matrix = np.cov(df1.T)
print("COVARIANCE MATRIX:\n", cov_matrix)

eig_val, eig_vect = np.linalg.eig(cov_matrix)
print("\nEigen Values\n", eig_val)
print("Eigen Vectors\n", eig_vect)

# np.linalg.eig does not sort eigenvalues; here the two largest
# (3.1837 and 0.0218) sit in columns 0 and 2, so those are picked
pc = eig_vect[:, [0, 2]]
pc = pc.T
print("\nTop 2 Principal Components:\n", pc)

trans_df = np.dot(df1[:, 0:3], pc.T)
print("\nNew Transform\n", trans_df)

Dataf = pd.DataFrame(trans_df, columns=['PC1', 'PC2'])
Dataf['GTU Marks'] = df['GTU'].values
print(Dataf)

OUTPUT:
    Mid_Sem   IQ  HSC  GTU
0        35  110   78   70
1        42  125   85   88
2        28  100   72   65
3        45  130   90   92
4        38  115   80   78
...
[15 rows x 4 columns]

[[-0.09736702 -0.20785572 -0.20441405]
 [ 1.03858157  1.20934235  0.81765621]
 [-1.23331562 -1.15265443 -1.08047428]
 [ 1.52541669  1.68174171  1.5477064 ]
 ...
 [-0.74648051 -0.96369469 -0.93446424]]

COVARIANCE MATRIX:
 [[1.07142857 1.0614152  1.05676449]
 [1.0614152  1.07142857 1.05019437]
 [1.05676449 1.05019437 1.07142857]]

Eigen Values
 [3.18368463 0.00878971 0.02181137]

Eigen Vectors
 [[-0.57842869 -0.7974863  -0.17156877]
 [-0.57723546  0.54876897 -0.60469152]
 [-0.57638483  0.25073535  0.77776109]]

Top 2 Principal Components:
 [[-0.57842869 -0.57723546 -0.57638483]
 [-0.17156877 -0.60469152  0.77776109]]

New Transform
 [[ 0.29412273 -0.01659157]
 [-1.77010531 -0.27352604]
 [ 2.00150714  0.06824795]
 [-2.74518022 -0.074903  ]
 ...
 [ 1.5266755  -0.01597918]]

         PC1       PC2  GTU Marks
0   0.294123 -0.016592         70
1  -1.770105 -0.273526         88
2   2.001507  0.068248         65
3  -2.745180 -0.074903         92
4  -0.428478 -0.158651         78
...
[15 rows x 3 columns]

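As a cross-check of the manual eigendecomposition, sklearn's PCA gives the same projection (up to a sign flip per component); a minimal sketch reusing the standardized df1 from above:

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
scores = pca.fit_transform(df1)
print(scores[:2])                         # matches trans_df up to sign
print(pca.explained_variance_ratio_)      # roughly [0.990, 0.007]
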

===============================================================================
PRACTICAL 3: DECISION TREE CLASSIFIER
(Classification with evaluation using precision, recall, and F1-score)
===============================================================================

CODE:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

data = pd.read_csv("decesiontree.csv")
print(data)

cleanup_nums = {"Age": {"Youth": 0, "Middle": 1, "Senior": 2},
                "Income": {"Low": 0, "Medium": 1, "High": 2},
                "Student": {"No": 0, "Yes": 1},
                "Credit Rating": {"Fair": 1, "Excellent": 2},
                "Buys-Computer": {"No": 0, "Yes": 1}}
data.replace(cleanup_nums, inplace=True)
print(data)

predictors = data.iloc[:, 1:5]
target = data.iloc[:, 5]

dtree_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100,
                                       max_depth=3, min_samples_leaf=5)
dtree_entropy.fit(predictors, target)

# evaluate on the training data (this practical has no hold-out split)
pred = dtree_entropy.predict(predictors)
print("Confusion Matrix:\n", confusion_matrix(target, pred))
print("Accuracy:", accuracy_score(target, pred))
print(classification_report(target, pred))

OUTPUT:
    Item no     Age  Income Student Credit Rating Buys-Computer
0         1   Youth    High      No          Fair            No
1         2   Youth    High      No     Excellent            No
2         3  Middle    High      No          Fair           Yes
3         4  Senior  Medium      No          Fair           Yes
4         5  Senior     Low     Yes          Fair           Yes
...
[14 rows x 6 columns]

    Item no  Age  Income  Student  Credit Rating  Buys-Computer
0         1    0       2        0              1              0
1         2    0       2        0              2              0
2         3    1       2        0              1              1
3         4    2       1        0              1              1
4         5    2       0        1              1              1
...
[14 rows x 6 columns]

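To inspect the splits the tree actually learned, sklearn can render it as text; a minimal sketch assuming the fitted dtree_entropy from above:

from sklearn.tree import export_text

print(export_text(dtree_entropy, feature_names=list(predictors.columns)))
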

===============================================================================
PRACTICAL 4: NAIVE BAYES CLASSIFIER
(Probabilistic classification using Gaussian Naive Bayes)
===============================================================================

CODE:
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

fl = "Naive_Bayesian.csv"
df = pd.read_csv(fl, index_col="Item no")
print(df)

dfCol = df.columns
print("df columns: ", dfCol)
ndfCol = df.shape[1]
ndfRow = df.shape[0]

# collect each column's values as a list
feature = [[] for _ in range(ndfCol)]
for i in range(ndfCol):
    feature[i] = list(df[dfCol[i]])
    print(dfCol[i], ":", feature[i])

le = preprocessing.LabelEncoder()

# label-encode every column
feature0 = [[] for _ in range(ndfCol)]
for i in range(ndfCol):
    feature0[i] = le.fit_transform(feature[i])
    print(dfCol[i], "encoded:", feature0[i])

# rebuild row-wise feature tuples from the encoded columns
features = []
for i in range(ndfRow):
    xlst = []
    for j in range(ndfCol - 1):
        xlst.append(feature0[j][i])
    xtup = tuple(xlst)
    features.append(xtup)

print("features:", features)

# last encoded column is the class label, shifted from 0/1 to 1/2
label = feature0[ndfCol - 1]
label = [label[i] + 1 for i in range(ndfRow)]
print("label:", label)

model = GaussianNB()
model.fit(features, label)
print("model:", model)

ptStr = input("Enter unknown data (separated by ,) excluding Index Column: ")
ptLst = [int(x) for x in ptStr.split(',')]
point1 = [ptLst]
print("Unknown data (sample):", point1)
predicted = model.predict(point1)
print("Class for Point:", point1, "is:", predicted)

OUTPUT (with input: 0,1,1,0):
            Age  Income Student Credit Rating Buys-Computer
Item no
1         Youth    High      No          Fair            No
2         Youth    High      No     Excellent            No
3        Middle    High      No          Fair           Yes
4        Senior  Medium      No          Fair           Yes
...
[14 rows x 5 columns]

df columns: Index(['Age', 'Income', 'Student', 'Credit Rating', 'Buys-Computer'], dtype='object')

Age : ['Youth', 'Youth', 'Middle', 'Senior', 'Senior', ...]
Income : ['High', 'High', 'High', 'Medium', 'Low', ...]
Student : ['No', 'No', 'No', 'No', 'Yes', ...]
Credit Rating : ['Fair', 'Excellent', 'Fair', 'Fair', 'Fair', ...]
Buys-Computer : ['No', 'No', 'Yes', 'Yes', 'Yes', ...]

Age encoded: [2 2 0 1 1 0 1 2 2 1 2 0 0 1]
Income encoded: [0 0 0 2 1 1 1 2 1 2 2 2 0 2]
Student encoded: [0 0 0 0 1 1 1 0 1 1 1 0 1 1]
Credit Rating encoded: [1 0 1 1 1 0 0 1 1 1 0 0 1 0]
Buys-Computer encoded: [0 0 1 1 1 0 1 0 1 1 1 1 1 0]

features: [(2, 0, 0, 1), (2, 0, 0, 0), (0, 0, 0, 1), (1, 2, 0, 1),
           (1, 1, 1, 1), (0, 1, 1, 0), ...]

label: [1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1]

model: GaussianNB()

Enter unknown data (separated by ,) excluding Index Column:
Unknown data (sample): [[0, 1, 1, 0]]
Class for Point: [[0, 1, 1, 0]] is: [2]

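GaussianNB treats the encoded features as continuous values. For purely categorical, label-encoded data like this, sklearn's CategoricalNB is arguably the closer model; a minimal sketch reusing features and label from above:

from sklearn.naive_bayes import CategoricalNB

cnb = CategoricalNB()
cnb.fit(features, label)
print(cnb.predict([[0, 1, 1, 0]]))
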

===============================================================================
PRACTICAL 5: LINEAR REGRESSION
(Predicting continuous values with evaluation using MAE, MSE, and RMSE)
===============================================================================

CODE:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics

dataset = pd.read_csv("LinearRegression.csv")
print(dataset)

x = dataset.iloc[:, 0:1]
y = dataset.iloc[:, 1]
# map the Yes/No target to 1/0 so it can be regressed on
y = y.replace(['Yes', 'No'], [1, 0])

print(y)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.01, random_state=123)

model = LinearRegression()
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_val = model.predict([[18]])
print(y_pred_val)

# threshold the continuous prediction at 0.5 to decide Yes/No
if y_pred_val > 0.5:
    print("Yes")
else:
    print("No")

plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, model.predict(X_train))
plt.show()

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

OUTPUT:
   Outside Temperature \nCelcius Wear a\n jacket
0                             30              No
1                             25              No
2                             20              No
3                             15             Yes
4                             10             Yes

0    0
1    0
2    0
3    1
4    1
Name: Wear a\n jacket, dtype: int64

[0.54285714]
Yes

Mean Absolute Error: 0.14285714285714302
Mean Squared Error: 0.02040816326530617
Root Mean Squared Error: 0.14285714285714302

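The fitted line itself can be read off the model; a short sketch continuing from model above (the prediction for 18 °C is just intercept + slope * 18, thresholded at 0.5):

print("slope:", model.coef_, "intercept:", model.intercept_)
# y_pred_val above equals model.intercept_ + model.coef_[0] * 18 = 0.5428...,
# which is > 0.5, hence "Yes"
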

===============================================================================
PRACTICAL 6: K-NEAREST NEIGHBORS (KNN) CLASSIFIER
(Classification using different k values with accuracy evaluation)
===============================================================================

CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("knn.csv")
df = df[df['Item no.'].notna()]
print("Dataset Preview:")
print(df.head())

X = df.iloc[:, 1:4]
y = df.iloc[:, 4]
print("INPUT\n", X)
print("OUTPUT\n", y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("XTRAIN\n", X_train)
print("X_TEST\n", X_test)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print("PREDICTION : \n", y_pred)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

OUTPUT:
Dataset Preview:
   Item no.  Temp  Humidity  Wind Speed Play ...
0       1.0  85.0      85.0        12.0   No ...
1       2.0  80.0      90.0         9.0   No ...
2       3.0  83.0      86.0         4.0  Yes ...
3       4.0  70.0      96.0         3.0  Yes ...
4       5.0  68.0      80.0         5.0  Yes ...

INPUT
    Temp  Humidity  Wind Speed
0   85.0      85.0        12.0
1   80.0      90.0         9.0
2   83.0      86.0         4.0
3   70.0      96.0         3.0
4   68.0      80.0         5.0
...
[14 rows x 3 columns]

OUTPUT
0     No
1     No
2    Yes
3    Yes
4    Yes
...
Name: Play, dtype: object

XTRAIN
 [[ 1.37690922 -0.53048047 -0.46006855]
 [-1.22885447 -0.99359834  2.25104967]
 [-0.57741354 -0.99359834 -0.46006855]
 [ 1.70262968  0.48837885 -0.64080976]
 ...
 [-1.3917147  -1.45671621 -1.00229219]]

X_TEST
 [[ 0.39974784 -0.0673626  -1.00229219]
 [-0.08883285  0.85887314 -0.64080976]
 [ 2.02835014  0.39575527  0.80511996]]

PREDICTION :
 ['Yes' 'Yes' 'Yes']

Accuracy: 0.6666666666666666

Classification Report:
              precision    recall  f1-score   support

          No       0.00      0.00      0.00         1
         Yes       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3

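Only k=3 is evaluated above, although the header mentions different k values; a minimal sketch of the usual sweep, reusing the scaled splits from above (with only 3 test rows the accuracies are coarse):

for k in [1, 3, 5, 7]:
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    print("k =", k, "accuracy =", accuracy_score(y_test, knn_k.predict(X_test)))
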

===============================================================================
PRACTICAL 7: MULTIPLE LINEAR REGRESSION
(Prediction using multiple features with R² score and RMSE evaluation)
===============================================================================

CODE:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

Data = pd.read_excel("student_data1.xlsx")
print(Data)

X = Data.iloc[:, :2]
y = Data.iloc[:, -1:]
print(X)
print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Xtrain\n", X_train)
print("YTEST\n", y_test)

model = LinearRegression()
model.fit(X_train.to_numpy(), y_train)

y_pred = model.predict([[8.6, 125]])
print("model prediction for CGPA 8.6, IQ 125:\n", y_pred.round(2))

print("M= ", model.coef_.round(2))
print("b= ", model.intercept_.round(2))

OUTPUT:
   CGPA   IQ  Placement (LPA)
0   7.5  110              6.5
1   8.0  120              7.0
2   8.5  125              8.2
3   9.0  130              9.1
4   6.5  100              5.0
...
[10 rows x 3 columns]

   CGPA   IQ
0   7.5  110
1   8.0  120
2   8.5  125
3   9.0  130
4   6.5  100
...
[10 rows x 2 columns]

   Placement (LPA)
0              6.5
1              7.0
2              8.2
3              9.1
4              5.0
...
[10 rows x 1 columns]

Xtrain
    CGPA   IQ
5   7.0  105
0   7.5  110
7   8.8  128
2   8.5  125
...
[8 rows x 2 columns]

YTEST
    Placement (LPA)
8              5.2
1              7.0

model prediction for CGPA 8.6, IQ 125:
 [[8.45]]

M=  [[1.32 0.03]]
b=  [-6.51]

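The header promises R² and RMSE, and r2_score is imported above but never called; a minimal evaluation sketch reusing the split and the fitted model:

y_pred_test = model.predict(X_test.to_numpy())
print("R2:", r2_score(y_test, y_pred_test))
print("RMSE:", np.sqrt(np.mean((y_test.to_numpy() - y_pred_test) ** 2)))
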

===============================================================================
PRACTICAL 8: SINGULAR VALUE DECOMPOSITION (SVD)
(Dimensionality Reduction using SVD - Manual & Sklearn Implementation)
===============================================================================

------- 8.1: SVD MANUAL IMPLEMENTATION (svd.py) -------

CODE:
import pandas as pd
import numpy as np

df = pd.read_excel("student_dataset.xlsx")
A = df.iloc[:, :3].to_numpy()
# center the data before decomposing
A_mean = A - np.mean(A, axis=0)

# S holds the singular values, already sorted in descending order
U, S, V_T = np.linalg.svd(A_mean)
k = 2
U_k = U[:, :k]
S_k = np.diag(S[:k])

final_data1 = np.dot(U_k, S_k)
print("Reduced Data:\n", final_data1)

explained_variance = (S[:k]**2) / np.sum(S**2)
print("Explained variance by top 2 components:", explained_variance)

reduced_df = pd.DataFrame(final_data1, columns=["PC1", "PC2"])
reduced_df['GTU'] = df['GTU'].values
print(reduced_df)

OUTPUT:
Reduced Data:
 [[ -2.60622042   0.08983428]
 [ 15.20533711  -2.22651162]
 [-16.15266994   0.28962606]
 [ 22.72992624  -0.77843972]
 [  3.46193108  -0.87192496]
 ...
 [-12.83777493   0.27648847]]

Explained variance by top 2 components: [0.99132896 0.00672569]

          PC1       PC2  GTU
0   -2.606220  0.089834   70
1   15.205337 -2.226512   88
2  -16.152670  0.289626   65
3   22.729926 -0.778440   92
4    3.461931 -0.871925   78
...
[15 rows x 3 columns]

------- 8.2: SVD USING SKLEARN (svd2.py) -------

CODE:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

df = pd.read_excel("student_dataset.xlsx")
X = df.iloc[:, :3]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_scaled)
print(X_reduced)

Dataf = pd.DataFrame(X_reduced, columns=['PC1', 'PC2'])
Dataf['GTU Marks'] = df['GTU'].values
print(Dataf)

print("Singular values:", svd.singular_values_)
print("Explained variance:", svd.explained_variance_)
print("Explained variance ratio:", svd.explained_variance_ratio_)
print("Total variance captured:", svd.explained_variance_ratio_.sum())

OUTPUT:
[[-0.29412273 -0.01659157]
 [ 1.77010531 -0.27352604]
 [-2.00150714  0.06824795]
 [ 2.74518022 -0.074903  ]
 [ 0.42847827 -0.1586513 ]
 ...
 [-1.5266755  -0.01597918]]

         PC1       PC2  GTU Marks
0  -0.294123 -0.016592         70
1   1.770105 -0.273526         88
2  -2.001507  0.068248         65
3   2.745180 -0.074903         92
4   0.428478 -0.158651         78
...
[15 rows x 3 columns]

Singular values: [6.67619539 0.55259316]
Explained variance: [2.97143899 0.02035728]
Explained variance ratio: [0.99047966 0.00678576]
Total variance captured: 0.997265423131314

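The singular values and explained variances above are tied together: TruncatedSVD reports the variance of each component's scores, i.e. s² / n with n = 15 rows, while the covariance eigenvalues of Practical 2 use s² / (n - 1). A quick numeric check against the printed values:

import numpy as np

s = np.array([6.67619539, 0.55259316])
print(s**2 / 15)       # [2.9714 0.0204], the explained variances above
print(s[0]**2 / 14)    # 3.1837, the top eigenvalue from Practical 2
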

===============================================================================
                                END OF JOURNAL
===============================================================================