===============================================================================
                    MACHINE LEARNING PRACTICALS - JOURNAL
===============================================================================

===============================================================================
PRACTICAL 1: FEATURE ENGINEERING AND DATA PREPROCESSING
(Handling missing values, Encoding categorical variables, Scaling features)
===============================================================================

------- 1.1: ENCODING (encoding.py) -------

CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

df = pd.read_csv("customer.csv")
print(df)

df1 = df.iloc[:, 2:]
print(df1)

x_train, x_test, y_train, y_test = train_test_split(df1.iloc[:, 0:2], df1.iloc[:, -1], test_size=0.1)
print("XTrain: \n", x_train)
print("Ytrain: \n", y_train)
print("XTEST: \n", x_test)
print("YTEST: \n", y_test)

# Ordinal encoding for the ordered categories (review, education)
oe = OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['HSC', 'UG', 'PG']])
oe.fit(x_train)
x_train = oe.transform(x_train)
x_test = oe.transform(x_test)
print(x_train)

# Label encoding for the target column
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
print(y_train)

# One-hot encoding of Gender using sklearn's OneHotEncoder
df2 = df.iloc[:, 1:2]
encod = OneHotEncoder(sparse_output=False)
encoded = encod.fit_transform(df2)
print("Feature Names:")
print(encod.get_feature_names_out())
print(encoded)

OUTPUT:
     age  Gender   review education Purchase
0    NaN    Male     Good       HSC      yes
1   48.0    Male     Good        PG       no
2   68.0  Female  Average        UG       no
3   77.0  Female  Average        PG      yes
4   26.0    Male     Poor        PG      yes
...
[14 rows x 5 columns]

     review education Purchase
0      Good       HSC      yes
1      Good        PG       no
2   Average        UG       no
3   Average        PG      yes
...
[14 rows x 3 columns]

XTrain:
      review education
3   Average        PG
8      Good        UG
6      Good        PG
2   Average        UG
...
[12 rows x 2 columns]

Ytrain:
 3     yes
8     yes
6     yes
2      no
...
Name: Purchase, dtype: object

XTEST:
     review education
5      Good        UG
13     Good        UG

YTEST:
 5      no
13    yes
Name: Purchase, dtype: object

[[1. 2.]
 [2. 1.]
 [2. 2.]
 [1. 1.]
 ...
 [0. 2.]]

[1 1 1 0 0 0 0 1 1 1 1 1]

Feature Names:
['Gender_Female' 'Gender_Male']

[[0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]]

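The one-hot step above uses sklearn's OneHotEncoder. For comparison, a minimal sketch of the equivalent encoding done directly in pandas, assuming the same customer.csv with its Gender column:

import pandas as pd

df = pd.read_csv("customer.csv")
# get_dummies creates one indicator column per category of Gender
dummies = pd.get_dummies(df["Gender"], prefix="Gender", dtype=float)
print(dummies.head())
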
------- 1.2: BOXPLOT AND HISTOGRAM (boxplot.py) -------

CODE:
import matplotlib.pyplot as plt
import numpy as np

arr = np.array([100, 120, 110, 150, 110, 140, 130, 170, 120, 220, 140, 110])
arr1 = np.sort(arr)
print(arr1)
mean = np.mean(arr)
print("MEAN=", mean)
median = np.median(arr)
print("MEDIAN=", median)
q1 = np.percentile(arr, 25)
print("Quarter 1=", q1)
q3 = np.percentile(arr, 75)
print("Quarter 3=", q3)
plt.boxplot(arr)
plt.show()
plt.hist(arr)
plt.show()

OUTPUT:
[100 110 110 110 120 120 130 140 140 150 170 220]
MEAN= 135.0
MEDIAN= 125.0
Quarter 1= 110.0
Quarter 3= 142.5

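The boxplot marks points beyond 1.5 × IQR from the quartiles as outliers; a short sketch, continuing from arr, q1, and q3 above, that applies the same rule by hand:

iqr = q3 - q1                     # 142.5 - 110.0 = 32.5
upper = q3 + 1.5 * iqr            # 191.25
lower = q1 - 1.5 * iqr            # 61.25
print(arr[(arr > upper) | (arr < lower)])   # [220] is flagged as an outlier
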
------- 1.3: CORRELATION WITH TARGET (corela_target.py) -------

CODE:
import pandas as pd

data = {
    'sqft': [1500, 1600, 1700, 1800, 1900],
    'rooms': [3, 3, 4, 4, 5],
    'roof_color': [1, 2, 1, 2, 1],
    'price': [300000, 320000, 340000, 360000, 380000]
}

df = pd.DataFrame(data)
correlation_matrix = df.corr(numeric_only=True)
print("🔁 Full Correlation Matrix:")
print(correlation_matrix.round(2))

correlation = df.corr()['price'].drop('price')
print(correlation)

selected_features = correlation[correlation.abs() > 0.3].index
print("Selected features:", list(selected_features))

OUTPUT:
🔁 Full Correlation Matrix:
            sqft  rooms  roof_color  price
sqft        1.00   0.94        0.00   1.00
rooms       0.94   1.00       -0.33   0.94
roof_color  0.00  -0.33        1.00   0.00
price       1.00   0.94        0.00   1.00

sqft          1.000000e+00
rooms         9.449112e-01
roof_color    5.250970e-17
Name: price, dtype: float64

Selected features: ['sqft', 'rooms']

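The perfect 1.00 correlation between sqft and price is no accident: in this toy data, price is exactly 200 * sqft. A quick check, reusing df from above:

print((df['price'] / df['sqft']).unique())   # [200.], an exact linear relation
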
------- 1.4: COLUMN TRANSFORMER ENCODING (column_trans_encod.py) -------

CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

df = pd.read_csv("customer.csv")
print(df)

x = df.iloc[:, :4]
y = df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

trans = ColumnTransformer(
    transformers=[
        ('impute_age', SimpleImputer(), ['age']),
        ('onehot_gender', OneHotEncoder(sparse_output=False), ['Gender']),
        ('ordinal_rating', OrdinalEncoder(categories=[['Poor', 'Average', 'Good']]), ['review']),
        ('ordinal_education', OrdinalEncoder(categories=[['HSC', 'UG', 'PG']]), ['education'])
    ],
    remainder='passthrough'
)

x_train = trans.fit_transform(x_train)
# use transform (not fit_transform) on the test set so the encoders
# fitted on the training data are reused rather than refitted
x_test = trans.transform(x_test)
print("\nTransformed XTrain:\n", x_train)
print("\nTransformed XTest:\n", x_test)

le = LabelEncoder()
y_train1 = le.fit_transform(y_train)
y_test1 = le.transform(y_test)
print("\nTransformed YTrain:\n", y_train1)
print("\nTransformed YTest:\n", y_test1)

OUTPUT:
     age  Gender   review education Purchase
0    NaN    Male     Good       HSC      yes
1   48.0    Male     Good        PG       no
2   68.0  Female  Average        UG       no
3   77.0  Female  Average        PG      yes
4   26.0    Male     Poor        PG      yes
...
[14 rows x 5 columns]

Transformed XTrain:
 [[55.  0.  1.  0.  2.]
 [18.  0.  1.  2.  1.]
 [44.  0.  1.  2.  1.]
 [50.  1.  0.  2.  1.]
 ...
 [26.  0.  1.  0.  2.]]

Transformed XTest:
 [[77.  1.  2.  0.]
 [77.  1.  1.  2.]]

Transformed YTrain:
 [1 0 1 1 1 0 1 0 0 0 1 1]

Transformed YTest:
 [0 0]

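To see which transformed column corresponds to which transformer, ColumnTransformer exposes get_feature_names_out (sklearn >= 1.0); a minimal sketch reusing the fitted trans from above, with illustrative output:

print(trans.get_feature_names_out())
# e.g. ['impute_age__age' 'onehot_gender__Gender_Female' 'onehot_gender__Gender_Male'
#       'ordinal_rating__review' 'ordinal_education__education']
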
------- 1.5: CORRELATION BETWEEN FEATURES (corel_bt_feat.py) -------

CODE:
import pandas as pd

data = {
    'sqft': [1500, 1600, 1700, 1800, 1900],
    'rooms': [3, 3, 4, 4, 5],
    'bathrooms': [1, 2, 2, 2, 3],
    'roof_color': [1, 2, 1, 2, 1],
    'price': [300000, 320000, 340000, 360000, 380000]
}
df = pd.DataFrame(data)
feature_corr = df.drop(columns='price').corr()
print("Correlation between features:")
print(feature_corr.round(2))

OUTPUT:
Correlation between features:
            sqft  rooms  bathrooms  roof_color
sqft        1.00   0.94       0.89        0.00
rooms       0.94   1.00       0.85       -0.33
bathrooms   0.89   0.85       1.00        0.00
roof_color  0.00  -0.33       0.00        1.00

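With sqft and rooms correlated at 0.94, one of the pair is usually dropped to limit multicollinearity. A minimal sketch of the common upper-triangle filter, reusing feature_corr from above (the 0.9 cutoff is an arbitrary choice):

import numpy as np

upper = feature_corr.where(np.triu(np.ones(feature_corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col].abs() > 0.9).any()]
print(to_drop)   # ['rooms'] under a 0.9 cutoff
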

===============================================================================
PRACTICAL 2: PRINCIPAL COMPONENT ANALYSIS (PCA)
(Dimensionality Reduction while retaining maximum variance)
===============================================================================

CODE:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("student_dataset.csv")
print(df)

scaler = StandardScaler()
df1 = scaler.fit_transform(df.iloc[:, :3])
print(df1)

cov_matrix = np.cov(df1.T)
print("COVARIANCE MATRIX:\n", cov_matrix)

eig_val, eig_vect = np.linalg.eig(cov_matrix)
print("\nEigen Values\n", eig_val)
print("Eigen Vectors\n", eig_vect)

# np.linalg.eig does not sort eigenvalues; here the two largest
# (3.1837 and 0.0218) sit in columns 0 and 2, so those are picked
pc = eig_vect[:, [0, 2]]
pc = pc.T
print("\nTop 2 Principal Components:\n", pc)

trans_df = np.dot(df1[:, 0:3], pc.T)
print("\nNew Transform\n", trans_df)

Dataf = pd.DataFrame(trans_df, columns=['PC1', 'PC2'])
Dataf['GTU Marks'] = df['GTU'].values
print(Dataf)

OUTPUT:
    Mid_Sem   IQ  HSC  GTU
0        35  110   78   70
1        42  125   85   88
2        28  100   72   65
3        45  130   90   92
4        38  115   80   78
...
[15 rows x 4 columns]

[[-0.09736702 -0.20785572 -0.20441405]
 [ 1.03858157  1.20934235  0.81765621]
 [-1.23331562 -1.15265443 -1.08047428]
 [ 1.52541669  1.68174171  1.5477064 ]
 ...
 [-0.74648051 -0.96369469 -0.93446424]]

COVARIANCE MATRIX:
 [[1.07142857 1.0614152  1.05676449]
 [1.0614152  1.07142857 1.05019437]
 [1.05676449 1.05019437 1.07142857]]

Eigen Values
 [3.18368463 0.00878971 0.02181137]

Eigen Vectors
 [[-0.57842869 -0.7974863  -0.17156877]
 [-0.57723546  0.54876897 -0.60469152]
 [-0.57638483  0.25073535  0.77776109]]

Top 2 Principal Components:
 [[-0.57842869 -0.57723546 -0.57638483]
 [-0.17156877 -0.60469152  0.77776109]]

New Transform
 [[ 0.29412273 -0.01659157]
 [-1.77010531 -0.27352604]
 [ 2.00150714  0.06824795]
 [-2.74518022 -0.074903  ]
 ...
 [ 1.5266755  -0.01597918]]

         PC1       PC2  GTU Marks
0   0.294123 -0.016592         70
1  -1.770105 -0.273526         88
2   2.001507  0.068248         65
3  -2.745180 -0.074903         92
4  -0.428478 -0.158651         78
...
[15 rows x 3 columns]

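As a cross-check of the manual eigendecomposition, sklearn's PCA gives the same projection (up to a sign flip per component); a minimal sketch reusing the standardized df1 from above:

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
scores = pca.fit_transform(df1)
print(scores[:2])                         # matches trans_df up to sign
print(pca.explained_variance_ratio_)      # roughly [0.990, 0.007]
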

===============================================================================
PRACTICAL 3: DECISION TREE CLASSIFIER
(Classification with evaluation using precision, recall, and F1-score)
===============================================================================

CODE:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

data = pd.read_csv("decesiontree.csv")
print(data)

cleanup_nums = {"Age": {"Youth": 0, "Middle": 1, "Senior": 2},
                "Income": {"Low": 0, "Medium": 1, "High": 2},
                "Student": {"No": 0, "Yes": 1},
                "Credit Rating": {"Fair": 1, "Excellent": 2},
                "Buys-Computer": {"No": 0, "Yes": 1}}
data.replace(cleanup_nums, inplace=True)
print(data)

predictors = data.iloc[:, 1:5]
target = data.iloc[:, 5]

dtree_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100,
                                       max_depth=3, min_samples_leaf=5)
dtree_entropy.fit(predictors, target)

# evaluate on the training data (this practical has no hold-out split)
pred = dtree_entropy.predict(predictors)
print("Confusion Matrix:\n", confusion_matrix(target, pred))
print("Accuracy:", accuracy_score(target, pred))
print(classification_report(target, pred))

OUTPUT:
    Item no     Age  Income Student Credit Rating Buys-Computer
0         1   Youth    High      No          Fair            No
1         2   Youth    High      No     Excellent            No
2         3  Middle    High      No          Fair           Yes
3         4  Senior  Medium      No          Fair           Yes
4         5  Senior     Low     Yes          Fair           Yes
...
[14 rows x 6 columns]

    Item no  Age  Income  Student  Credit Rating  Buys-Computer
0         1    0       2        0              1              0
1         2    0       2        0              2              0
2         3    1       2        0              1              1
3         4    2       1        0              1              1
4         5    2       0        1              1              1
...
[14 rows x 6 columns]

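To inspect the splits the tree actually learned, sklearn can render it as text; a minimal sketch assuming the fitted dtree_entropy from above:

from sklearn.tree import export_text

print(export_text(dtree_entropy, feature_names=list(predictors.columns)))
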

===============================================================================
PRACTICAL 4: NAIVE BAYES CLASSIFIER
(Probabilistic classification using Gaussian Naive Bayes)
===============================================================================

CODE:
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

fl = "Naive_Bayesian.csv"
df = pd.read_csv(fl, index_col="Item no")
print(df)

dfCol = df.columns
print("df columns: ", dfCol)
ndfCol = df.shape[1]
ndfRow = df.shape[0]

# collect each column's values as a list
feature = [[] for _ in range(ndfCol)]
for i in range(ndfCol):
    feature[i] = list(df[dfCol[i]])
    print(dfCol[i], ":", feature[i])

le = preprocessing.LabelEncoder()

# label-encode every column
feature0 = [[] for _ in range(ndfCol)]
for i in range(ndfCol):
    feature0[i] = le.fit_transform(feature[i])
    print(dfCol[i], "encoded:", feature0[i])

# rebuild row-wise feature tuples from the encoded columns
features = []
for i in range(ndfRow):
    xlst = []
    for j in range(ndfCol - 1):
        xlst.append(feature0[j][i])
    xtup = tuple(xlst)
    features.append(xtup)

print("features:", features)

# last encoded column is the class label, shifted from 0/1 to 1/2
label = feature0[ndfCol - 1]
label = [label[i] + 1 for i in range(ndfRow)]
print("label:", label)

model = GaussianNB()
model.fit(features, label)
print("model:", model)

ptStr = input("Enter unknown data (separated by ,) excluding Index Column: ")
ptLst = [int(x) for x in ptStr.split(',')]
point1 = [ptLst]
print("Unknown data (sample):", point1)
predicted = model.predict(point1)
print("Class for Point:", point1, "is:", predicted)

OUTPUT (with input: 0,1,1,0):
            Age  Income Student Credit Rating Buys-Computer
Item no
1         Youth    High      No          Fair            No
2         Youth    High      No     Excellent            No
3        Middle    High      No          Fair           Yes
4        Senior  Medium      No          Fair           Yes
...
[14 rows x 5 columns]

df columns: Index(['Age', 'Income', 'Student', 'Credit Rating', 'Buys-Computer'], dtype='object')

Age : ['Youth', 'Youth', 'Middle', 'Senior', 'Senior', ...]
Income : ['High', 'High', 'High', 'Medium', 'Low', ...]
Student : ['No', 'No', 'No', 'No', 'Yes', ...]
Credit Rating : ['Fair', 'Excellent', 'Fair', 'Fair', 'Fair', ...]
Buys-Computer : ['No', 'No', 'Yes', 'Yes', 'Yes', ...]

Age encoded: [2 2 0 1 1 0 1 2 2 1 2 0 0 1]
Income encoded: [0 0 0 2 1 1 1 2 1 2 2 2 0 2]
Student encoded: [0 0 0 0 1 1 1 0 1 1 1 0 1 1]
Credit Rating encoded: [1 0 1 1 1 0 0 1 1 1 0 0 1 0]
Buys-Computer encoded: [0 0 1 1 1 0 1 0 1 1 1 1 1 0]

features: [(2, 0, 0, 1), (2, 0, 0, 0), (0, 0, 0, 1), (1, 2, 0, 1),
           (1, 1, 1, 1), (0, 1, 1, 0), ...]

label: [1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1]

model: GaussianNB()

Enter unknown data (separated by ,) excluding Index Column:
Unknown data (sample): [[0, 1, 1, 0]]
Class for Point: [[0, 1, 1, 0]] is: [2]

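GaussianNB treats the encoded features as continuous values. For purely categorical, label-encoded data like this, sklearn's CategoricalNB is arguably the closer model; a minimal sketch reusing features and label from above:

from sklearn.naive_bayes import CategoricalNB

cnb = CategoricalNB()
cnb.fit(features, label)
print(cnb.predict([[0, 1, 1, 0]]))
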

===============================================================================
PRACTICAL 5: LINEAR REGRESSION
(Predicting continuous values with evaluation using MAE, MSE, and RMSE)
===============================================================================

CODE:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics

dataset = pd.read_csv("LinearRegression.csv")
print(dataset)

x = dataset.iloc[:, 0:1]
y = dataset.iloc[:, 1]
# map the Yes/No target to 1/0 so it can be regressed on
y = y.replace(['Yes', 'No'], [1, 0])

print(y)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.01, random_state=123)

model = LinearRegression()
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_val = model.predict([[18]])
print(y_pred_val)

# threshold the continuous prediction at 0.5 to decide Yes/No
if y_pred_val > 0.5:
    print("Yes")
else:
    print("No")

plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, model.predict(X_train))
plt.show()

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

OUTPUT:
   Outside Temperature \nCelcius Wear a\n jacket
0                             30              No
1                             25              No
2                             20              No
3                             15             Yes
4                             10             Yes

0    0
1    0
2    0
3    1
4    1
Name: Wear a\n jacket, dtype: int64

[0.54285714]
Yes

Mean Absolute Error: 0.14285714285714302
Mean Squared Error: 0.02040816326530617
Root Mean Squared Error: 0.14285714285714302

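The fitted line itself can be read off the model; a short sketch continuing from model above (the prediction for 18 °C is just intercept + slope * 18, thresholded at 0.5):

print("slope:", model.coef_, "intercept:", model.intercept_)
# y_pred_val above equals model.intercept_ + model.coef_[0] * 18 = 0.5428...,
# which is > 0.5, hence "Yes"
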

===============================================================================
PRACTICAL 6: K-NEAREST NEIGHBORS (KNN) CLASSIFIER
(Classification using different k values with accuracy evaluation)
===============================================================================

CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("knn.csv")
df = df[df['Item no.'].notna()]
print("Dataset Preview:")
print(df.head())

X = df.iloc[:, 1:4]
y = df.iloc[:, 4]
print("INPUT\n", X)
print("OUTPUT\n", y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("XTRAIN\n", X_train)
print("X_TEST\n", X_test)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print("PREDICTION : \n", y_pred)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

OUTPUT:
Dataset Preview:
   Item no.  Temp  Humidity  Wind Speed Play ...
0       1.0  85.0      85.0        12.0   No ...
1       2.0  80.0      90.0         9.0   No ...
2       3.0  83.0      86.0         4.0  Yes ...
3       4.0  70.0      96.0         3.0  Yes ...
4       5.0  68.0      80.0         5.0  Yes ...

INPUT
    Temp  Humidity  Wind Speed
0   85.0      85.0        12.0
1   80.0      90.0         9.0
2   83.0      86.0         4.0
3   70.0      96.0         3.0
4   68.0      80.0         5.0
...
[14 rows x 3 columns]

OUTPUT
0     No
1     No
2    Yes
3    Yes
4    Yes
...
Name: Play, dtype: object

XTRAIN
 [[ 1.37690922 -0.53048047 -0.46006855]
 [-1.22885447 -0.99359834  2.25104967]
 [-0.57741354 -0.99359834 -0.46006855]
 [ 1.70262968  0.48837885 -0.64080976]
 ...
 [-1.3917147  -1.45671621 -1.00229219]]

X_TEST
 [[ 0.39974784 -0.0673626  -1.00229219]
 [-0.08883285  0.85887314 -0.64080976]
 [ 2.02835014  0.39575527  0.80511996]]

PREDICTION :
 ['Yes' 'Yes' 'Yes']

Accuracy: 0.6666666666666666

Classification Report:
              precision    recall  f1-score   support

          No       0.00      0.00      0.00         1
         Yes       0.67      1.00      0.80         2

    accuracy                           0.67         3
   macro avg       0.33      0.50      0.40         3
weighted avg       0.44      0.67      0.53         3

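Only k=3 is evaluated above, although the header mentions different k values; a minimal sketch of the usual sweep, reusing the scaled splits from above (with only 3 test rows the accuracies are coarse):

for k in [1, 3, 5, 7]:
    knn_k = KNeighborsClassifier(n_neighbors=k)
    knn_k.fit(X_train, y_train)
    print("k =", k, "accuracy =", accuracy_score(y_test, knn_k.predict(X_test)))
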

===============================================================================
PRACTICAL 7: MULTIPLE LINEAR REGRESSION
(Prediction using multiple features with R² score and RMSE evaluation)
===============================================================================

CODE:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

Data = pd.read_excel("student_data1.xlsx")
print(Data)

X = Data.iloc[:, :2]
y = Data.iloc[:, -1:]
print(X)
print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Xtrain\n", X_train)
print("YTEST\n", y_test)

model = LinearRegression()
model.fit(X_train.to_numpy(), y_train)

y_pred = model.predict([[8.6, 125]])
print("model prediction for CGPA 8.6, IQ 125:\n", y_pred.round(2))

print("M= ", model.coef_.round(2))
print("b= ", model.intercept_.round(2))

OUTPUT:
   CGPA   IQ  Placement (LPA)
0   7.5  110              6.5
1   8.0  120              7.0
2   8.5  125              8.2
3   9.0  130              9.1
4   6.5  100              5.0
...
[10 rows x 3 columns]

   CGPA   IQ
0   7.5  110
1   8.0  120
2   8.5  125
3   9.0  130
4   6.5  100
...
[10 rows x 2 columns]

   Placement (LPA)
0              6.5
1              7.0
2              8.2
3              9.1
4              5.0
...
[10 rows x 1 columns]

Xtrain
    CGPA   IQ
5   7.0  105
0   7.5  110
7   8.8  128
2   8.5  125
...
[8 rows x 2 columns]

YTEST
    Placement (LPA)
8              5.2
1              7.0

model prediction for CGPA 8.6, IQ 125:
 [[8.45]]

M=  [[1.32 0.03]]
b=  [-6.51]

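The header promises R² and RMSE, and r2_score is imported above but never called; a minimal evaluation sketch reusing the split and the fitted model:

y_pred_test = model.predict(X_test.to_numpy())
print("R2:", r2_score(y_test, y_pred_test))
print("RMSE:", np.sqrt(np.mean((y_test.to_numpy() - y_pred_test) ** 2)))
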

===============================================================================
PRACTICAL 8: SINGULAR VALUE DECOMPOSITION (SVD)
(Dimensionality Reduction using SVD - Manual & Sklearn Implementation)
===============================================================================

------- 8.1: SVD MANUAL IMPLEMENTATION (svd.py) -------

CODE:
import pandas as pd
import numpy as np

df = pd.read_excel("student_dataset.xlsx")
A = df.iloc[:, :3].to_numpy()
# center the data before decomposing
A_mean = A - np.mean(A, axis=0)

# S holds the singular values, already sorted in descending order
U, S, V_T = np.linalg.svd(A_mean)
k = 2
U_k = U[:, :k]
S_k = np.diag(S[:k])

final_data1 = np.dot(U_k, S_k)
print("Reduced Data:\n", final_data1)

explained_variance = (S[:k]**2) / np.sum(S**2)
print("Explained variance by top 2 components:", explained_variance)

reduced_df = pd.DataFrame(final_data1, columns=["PC1", "PC2"])
reduced_df['GTU'] = df['GTU'].values
print(reduced_df)

OUTPUT:
Reduced Data:
 [[ -2.60622042   0.08983428]
 [ 15.20533711  -2.22651162]
 [-16.15266994   0.28962606]
 [ 22.72992624  -0.77843972]
 [  3.46193108  -0.87192496]
 ...
 [-12.83777493   0.27648847]]

Explained variance by top 2 components: [0.99132896 0.00672569]

          PC1       PC2  GTU
0   -2.606220  0.089834   70
1   15.205337 -2.226512   88
2  -16.152670  0.289626   65
3   22.729926 -0.778440   92
4    3.461931 -0.871925   78
...
[15 rows x 3 columns]

------- 8.2: SVD USING SKLEARN (svd2.py) -------

CODE:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

df = pd.read_excel("student_dataset.xlsx")
X = df.iloc[:, :3]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_scaled)
print(X_reduced)

Dataf = pd.DataFrame(X_reduced, columns=['PC1', 'PC2'])
Dataf['GTU Marks'] = df['GTU'].values
print(Dataf)

print("Singular values:", svd.singular_values_)
print("Explained variance:", svd.explained_variance_)
print("Explained variance ratio:", svd.explained_variance_ratio_)
print("Total variance captured:", svd.explained_variance_ratio_.sum())

OUTPUT:
[[-0.29412273 -0.01659157]
 [ 1.77010531 -0.27352604]
 [-2.00150714  0.06824795]
 [ 2.74518022 -0.074903  ]
 [ 0.42847827 -0.1586513 ]
 ...
 [-1.5266755  -0.01597918]]

         PC1       PC2  GTU Marks
0  -0.294123 -0.016592         70
1   1.770105 -0.273526         88
2  -2.001507  0.068248         65
3   2.745180 -0.074903         92
4   0.428478 -0.158651         78
...
[15 rows x 3 columns]

Singular values: [6.67619539 0.55259316]
Explained variance: [2.97143899 0.02035728]
Explained variance ratio: [0.99047966 0.00678576]
Total variance captured: 0.997265423131314

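The singular values and explained variances above are tied together: TruncatedSVD reports the variance of each component's scores, i.e. s² / n with n = 15 rows, while the covariance eigenvalues of Practical 2 use s² / (n - 1). A quick numeric check against the printed values:

import numpy as np

s = np.array([6.67619539, 0.55259316])
print(s**2 / 15)       # [2.9714 0.0204], the explained variances above
print(s[0]**2 / 14)    # 3.1837, the top eigenvalue from Practical 2
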

===============================================================================
                                END OF JOURNAL
===============================================================================