===============================================================================
MACHINE LEARNING PRACTICALS - JOURNAL
===============================================================================

===============================================================================
PRACTICAL 1: FEATURE ENGINEERING AND DATA PREPROCESSING
(Handling missing values, Encoding categorical variables, Scaling features)
===============================================================================

------- 1.1: ENCODING (encoding.py) -------

CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

df = pd.read_csv("customer.csv")
print(df)

# keep only the categorical feature columns and the target
df1 = df.iloc[:, 2:]
print(df1)

x_train, x_test, y_train, y_test = train_test_split(df1.iloc[:, 0:2], df1.iloc[:, -1], test_size=0.1)
print("XTrain: \n", x_train)
print("Ytrain: \n", y_train)
print("XTEST: \n", x_test)
print("YTEST: \n", y_test)

# Ordinal encoding: fit on the training split only, then transform both splits
oe = OrdinalEncoder(categories=[['Poor', 'Average', 'Good'], ['HSC', 'UG', 'PG']])
oe.fit(x_train)
x_train = oe.transform(x_train)
x_test = oe.transform(x_test)
print(x_train)

# Label encoding for the target column
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)
print(y_train)

# One-hot encoding of Gender using sklearn's OneHotEncoder
df2 = df.iloc[:, 1:2]
encod = OneHotEncoder(sparse_output=False)
encoded = encod.fit_transform(df2)
print("Feature Names:")
print(encod.get_feature_names_out())
print(encoded)

OUTPUT:
age Gender review education Purchase
0 NaN Male Good HSC yes
1 48.0 Male Good PG no
2 68.0 Female Average UG no
3 77.0 Female Average PG yes
4 26.0 Male Poor PG yes
...
[14 rows x 5 columns]

review education Purchase
0 Good HSC yes
1 Good PG no
2 Average UG no
3 Average PG yes
...
[14 rows x 3 columns]

XTrain:
review education
3 Average PG
8 Good UG
6 Good PG
2 Average UG
...
[12 rows x 2 columns]

Ytrain:
3 yes
8 yes
6 yes
2 no
...
Name: Purchase, dtype: object

XTEST:
review education
5 Good UG
13 Good UG

YTEST:
5 no
13 yes
Name: Purchase, dtype: object

[[1. 2.]
[2. 1.]
[2. 2.]
[1. 1.]
...
[0. 2.]]

[1 1 1 0 0 0 0 1 1 1 1 1]

Feature Names:
['Gender_Female' 'Gender_Male']

[[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
...
[1. 0.]]
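
NOTE: The one-hot step above uses sklearn's OneHotEncoder. For reference,
pandas can produce the same 0/1 columns directly with pd.get_dummies. A
minimal sketch (assumes the same customer.csv with its 'Gender' column):

import pandas as pd

df = pd.read_csv("customer.csv")
# one 0/1 column per category, mirroring OneHotEncoder's output
dummies = pd.get_dummies(df['Gender'], prefix='Gender', dtype=int)
print(dummies.head())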

------- 1.2: BOXPLOT AND HISTOGRAM (boxplot.py) -------

CODE:
import matplotlib.pyplot as plt
import numpy as np

arr = np.array([100, 120, 110, 150, 110, 140, 130, 170, 120, 220, 140, 110])
arr1 = np.sort(arr)
print(arr1)
mean = np.mean(arr)
print("MEAN=", mean)
median = np.median(arr)
print("MEDIAN=", median)
q1 = np.percentile(arr, 25)   # percentile sorts internally, so sorting first is not required
print("Quarter 1=", q1)
q3 = np.percentile(arr, 75)
print("Quarter 3=", q3)
plt.boxplot(arr)
plt.show()
plt.hist(arr)
plt.show()

OUTPUT:
[100 110 110 110 120 120 130 140 140 150 170 220]
MEAN= 135.0
MEDIAN= 125.0
Quarter 1= 110.0
Quarter 3= 142.5
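
NOTE: The boxplot flags 220 as an outlier. A minimal sketch of the usual
1.5*IQR rule behind that whisker placement (values below Q1 - 1.5*IQR or
above Q3 + 1.5*IQR count as outliers):

import numpy as np

arr = np.array([100, 120, 110, 150, 110, 140, 130, 170, 120, 220, 140, 110])
q1, q3 = np.percentile(arr, [25, 75])
iqr = q3 - q1                                  # 142.5 - 110.0 = 32.5
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
print("Bounds:", lower, upper)                 # 61.25 191.25
print("Outliers:", arr[(arr < lower) | (arr > upper)])   # [220]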

------- 1.3: CORRELATION WITH TARGET (corela_target.py) -------

CODE:
import pandas as pd

data = {
    'sqft': [1500, 1600, 1700, 1800, 1900],
    'rooms': [3, 3, 4, 4, 5],
    'roof_color': [1, 2, 1, 2, 1],
    'price': [300000, 320000, 340000, 360000, 380000]
}

df = pd.DataFrame(data)
correlation_matrix = df.corr(numeric_only=True)
print("🔁 Full Correlation Matrix:")
print(correlation_matrix.round(2))

# correlation of every feature with the target
correlation = correlation_matrix['price'].drop('price')
print(correlation)

# keep features whose absolute correlation with price exceeds 0.3
selected_features = correlation[correlation.abs() > 0.3].index
print("Selected features:", list(selected_features))

OUTPUT:
🔁 Full Correlation Matrix:
sqft rooms roof_color price
sqft 1.00 0.94 0.00 1.00
rooms 0.94 1.00 -0.33 0.94
roof_color 0.00 -0.33 1.00 0.00
price 1.00 0.94 0.00 1.00

sqft 1.000000e+00
rooms 9.449112e-01
roof_color 5.250970e-17
Name: price, dtype: float64

Selected features: ['sqft', 'rooms']
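
NOTE: Pearson's r only captures linear association. A one-line robustness
check with rank (Spearman) correlation, reusing the same df (monotonic
association; sqft and rooms stay high, roof_color stays near zero):

spearman = df.corr(method='spearman')['price'].drop('price')
print(spearman.round(2))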

------- 1.4: COLUMN TRANSFORMER ENCODING (column_trans_encod.py) -------

CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

df = pd.read_csv("customer.csv")
print(df)

x = df.iloc[:, :4]
y = df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

trans = ColumnTransformer(
    transformers=[
        ('impute_age', SimpleImputer(), ['age']),
        ('onehot_gender', OneHotEncoder(sparse_output=False), ['Gender']),
        ('ordinal_rating', OrdinalEncoder(categories=[['Poor', 'Average', 'Good']]), ['review']),
        ('ordinal_education', OrdinalEncoder(categories=[['HSC', 'UG', 'PG']]), ['education'])
    ],
    remainder='passthrough'
)

# fit on the training split only; re-fitting on the test split leaks
# information and can even change the number of one-hot columns
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)
print("\nTransformed XTrain:\n", x_train)
print("\nTransformed XTest:\n", x_test)

le = LabelEncoder()
y_train1 = le.fit_transform(y_train)
y_test1 = le.transform(y_test)
print("\nTransformed YTrain:\n", y_train1)
print("\nTransformed YTest:\n", y_test1)

OUTPUT:
age Gender review education Purchase
0 NaN Male Good HSC yes
1 48.0 Male Good PG no
2 68.0 Female Average UG no
3 77.0 Female Average PG yes
4 26.0 Male Poor PG yes
...
[14 rows x 5 columns]

Transformed XTrain:
[[55. 0. 1. 0. 2.]
[18. 0. 1. 2. 1.]
[44. 0. 1. 2. 1.]
[50. 1. 0. 2. 1.]
...
[26. 0. 1. 0. 2.]]

Transformed XTest:
[[77. 1. 2. 0.]
[77. 1. 1. 2.]]

Transformed YTrain:
[1 0 1 1 1 0 1 0 0 0 1 1]

Transformed YTest:
[0 0]
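
NOTE: In practice the ColumnTransformer is usually wrapped in a Pipeline so
that preprocessing and a model are fit together in one call. A minimal
sketch, reusing trans, x and y from above (the DecisionTreeClassifier is an
illustrative model choice, not part of this practical's spec):

from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# fresh, untransformed splits
xtr, xte, ytr, yte = train_test_split(x, y, test_size=0.1)
pipe = Pipeline([('prep', trans), ('clf', DecisionTreeClassifier())])
pipe.fit(xtr, ytr)        # fits imputer, encoders and the tree in one call
print(pipe.predict(xte))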

------- 1.5: CORRELATION BETWEEN FEATURES (corel_bt_feat.py) -------

CODE:
import pandas as pd

data = {
    'sqft': [1500, 1600, 1700, 1800, 1900],
    'rooms': [3, 3, 4, 4, 5],
    'bathrooms': [1, 2, 2, 2, 3],
    'roof_color': [1, 2, 1, 2, 1],
    'price': [300000, 320000, 340000, 360000, 380000]
}
df = pd.DataFrame(data)

# pairwise correlation among the features only (target excluded)
feature_corr = df.drop(columns='price').corr()
print("Correlation between features:")
print(feature_corr.round(2))

OUTPUT:
Correlation between features:
sqft rooms bathrooms roof_color
sqft 1.00 0.94 0.89 0.00
rooms 0.94 1.00 0.85 -0.33
bathrooms 0.89 0.85 1.00 0.00
roof_color 0.00 -0.33 0.00 1.00
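
NOTE: sqft, rooms and bathrooms are strongly inter-correlated
(multicollinearity), so they carry overlapping information. A minimal sketch
of a common filter: drop one feature from every pair with |r| above a chosen
threshold (0.9 here is an arbitrary cutoff), reusing df from above:

import numpy as np

corr = df.drop(columns='price').corr().abs()
# keep only the upper triangle so each pair is examined once
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
print("Drop candidates:", to_drop)   # ['rooms'] for this toy data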


===============================================================================
PRACTICAL 2: PRINCIPAL COMPONENT ANALYSIS (PCA)
(Dimensionality Reduction while retaining maximum variance)
===============================================================================

CODE:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("student_dataset.csv")
print(df)

# standardise the three feature columns (Mid_Sem, IQ, HSC)
scaler = StandardScaler()
df1 = scaler.fit_transform(df.iloc[:, :3])
print(df1)

cov_matrix = np.cov(df1.T)
print("COVARIANCE MATRIX:\n", cov_matrix)

eig_val, eig_vect = np.linalg.eig(cov_matrix)
print("\nEigen Values\n", eig_val)
print("Eigen Vectors\n", eig_vect)

# np.linalg.eig does not sort, so pick the eigenvectors belonging to the two
# largest eigenvalues (columns 0 and 2 for this data)
top2 = np.argsort(eig_val)[::-1][:2]
pc = eig_vect[:, top2].T
print("\nTop 2 Principal Components:\n", pc)

# project the standardised data onto the two components
trans_df = np.dot(df1, pc.T)
print(" \nNew Transform\n", trans_df)

Dataf = pd.DataFrame(trans_df, columns=['PC1', 'PC2'])
Dataf['GTU Marks'] = df['GTU'].values
print(Dataf)

OUTPUT:
Mid_Sem IQ HSC GTU
0 35 110 78 70
1 42 125 85 88
2 28 100 72 65
3 45 130 90 92
4 38 115 80 78
...
[15 rows x 4 columns]

[[-0.09736702 -0.20785572 -0.20441405]
[ 1.03858157 1.20934235 0.81765621]
[-1.23331562 -1.15265443 -1.08047428]
[ 1.52541669 1.68174171 1.5477064 ]
...
[-0.74648051 -0.96369469 -0.93446424]]

COVARIANCE MATRIX:
[[1.07142857 1.0614152 1.05676449]
[1.0614152 1.07142857 1.05019437]
[1.05676449 1.05019437 1.07142857]]

Eigen Values
[3.18368463 0.00878971 0.02181137]

Eigen Vectors
[[-0.57842869 -0.7974863 -0.17156877]
[-0.57723546 0.54876897 -0.60469152]
[-0.57638483 0.25073535 0.77776109]]

Top 2 Principal Components:
[[-0.57842869 -0.57723546 -0.57638483]
[-0.17156877 -0.60469152 0.77776109]]

New Transform
[[ 0.29412273 -0.01659157]
[-1.77010531 -0.27352604]
[ 2.00150714 0.06824795]
[-2.74518022 -0.074903 ]
...
[ 1.5266755 -0.01597918]]

PC1 PC2 GTU Marks
0 0.294123 -0.016592 70
1 -1.770105 -0.273526 88
2 2.001507 0.068248 65
3 -2.745180 -0.074903 92
4 -0.428478 -0.158651 78
...
[15 rows x 3 columns]
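
NOTE: The manual eigen-decomposition can be cross-checked against sklearn's
PCA. A minimal sketch, reusing the standardised df1 from above (component
signs may be flipped relative to the manual result, which is normal for PCA):

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
scores = pca.fit_transform(df1)
print(scores[:3])                        # should match PC1/PC2 up to sign
print(pca.explained_variance_ratio_)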


===============================================================================
PRACTICAL 3: DECISION TREE CLASSIFIER
(Classification with evaluation using precision, recall, and F1-score)
===============================================================================

CODE:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

data = pd.read_csv("decesiontree.csv")
print(data)

# map the ordinal/binary categories to integers
cleanup_nums = {"Age": {"Youth": 0, "Middle": 1, "Senior": 2},
                "Income": {"Low": 0, "Medium": 1, "High": 2},
                "Student": {"No": 0, "Yes": 1},
                "Credit Rating": {"Fair": 1, "Excellent": 2},
                "Buys-Computer": {"No": 0, "Yes": 1}}
data.replace(cleanup_nums, inplace=True)
print(data)

predictors = data.iloc[:, 1:5]
target = data.iloc[:, 5]

dtree_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100,
                                       max_depth=3, min_samples_leaf=5)

# fit and evaluate; with only 14 rows there is no held-out split, so these
# are training-set scores only
dtree_entropy.fit(predictors, target)
pred = dtree_entropy.predict(predictors)
print(confusion_matrix(target, pred))
print("Accuracy:", accuracy_score(target, pred))
print(classification_report(target, pred))

OUTPUT:
Item no Age Income Student Credit Rating Buys-Computer
0 1 Youth High No Fair No
1 2 Youth High No Excellent No
2 3 Middle High No Fair Yes
3 4 Senior Medium No Fair Yes
4 5 Senior Low Yes Fair Yes
...
[14 rows x 6 columns]

Item no Age Income Student Credit Rating Buys-Computer
0 1 0 2 0 1 0
1 2 0 2 0 2 0
2 3 1 2 0 1 1
3 4 2 1 0 1 1
4 5 2 0 1 1 1
...
[14 rows x 6 columns]


===============================================================================
PRACTICAL 4: NAIVE BAYES CLASSIFIER
(Probabilistic classification using Gaussian Naive Bayes)
===============================================================================

CODE:
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

fl = "Naive_Bayesian.csv"
df = pd.read_csv(fl, index_col="Item no")
print(df)

dfCol = df.columns
print("df columns: ", dfCol)
ndfCol = df.shape[1]
ndfRow = df.shape[0]

# pull each column out as a plain list
feature = [[] for _ in range(ndfCol)]
for i in range(ndfCol):
    feature[i] = list(df[dfCol[i]])
    print(dfCol[i], ":", feature[i])

le = preprocessing.LabelEncoder()

# label-encode every column (the encoder is re-fit per column)
feature0 = [[] for _ in range(ndfCol)]
for i in range(ndfCol):
    feature0[i] = le.fit_transform(feature[i])
    print(dfCol[i], "encoded:", feature0[i])

# rebuild the rows: one tuple of encoded feature values per sample,
# excluding the last column (the class label)
features = []
for i in range(ndfRow):
    xlst = []
    for j in range(ndfCol - 1):
        xlst.append(feature0[j][i])
    features.append(tuple(xlst))

print("features:", features)

# the last encoded column is the target; shift {0,1} to {1,2} (cosmetic only)
label = feature0[ndfCol - 1]
label = [label[i] + 1 for i in range(ndfRow)]
print("label:", label)

model = GaussianNB()
model.fit(features, label)
print("model:", model)

ptStr = input("Enter unknown data (separated by ,) excluding Index Column: ")
ptLst = [int(x) for x in ptStr.split(',')]
point1 = [ptLst]
print("Unknown data (sample):", point1)
predicted = model.predict(point1)
print("Class for Point:", point1, "is:", predicted)

OUTPUT (with input: 0,1,1,0):
Age Income Student Credit Rating Buys-Computer
Item no
1 Youth High No Fair No
2 Youth High No Excellent No
3 Middle High No Fair Yes
4 Senior Medium No Fair Yes
...
[14 rows x 5 columns]

df columns: Index(['Age', 'Income', 'Student', 'Credit Rating', 'Buys-Computer'], dtype='object')

Age : ['Youth', 'Youth', 'Middle', 'Senior', 'Senior', ...]
Income : ['High', 'High', 'High', 'Medium', 'Low', ...]
Student : ['No', 'No', 'No', 'No', 'Yes', ...]
Credit Rating : ['Fair', 'Excellent', 'Fair', 'Fair', 'Fair', ...]
Buys-Computer : ['No', 'No', 'Yes', 'Yes', 'Yes', ...]

Age encoded: [2 2 0 1 1 0 1 2 2 1 2 0 0 1]
Income encoded: [0 0 0 2 1 1 1 2 1 2 2 2 0 2]
Student encoded: [0 0 0 0 1 1 1 0 1 1 1 0 1 1]
Credit Rating encoded: [1 0 1 1 1 0 0 1 1 1 0 0 1 0]
Buys-Computer encoded: [0 0 1 1 1 0 1 0 1 1 1 1 1 0]

features: [(2, 0, 0, 1), (2, 0, 0, 0), (0, 0, 0, 1), (1, 2, 0, 1),
(1, 1, 1, 1), (0, 1, 1, 0), ...]

label: [1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1]

model: GaussianNB()

Enter unknown data (separated by ,) excluding Index Column:
Unknown data (sample): [[0, 1, 1, 0]]
Class for Point: [[0, 1, 1, 0]] is: [2]
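
NOTE: The column-by-column encoding above can be written much more compactly
with pandas. A minimal sketch of the same model (apply re-fits a fresh
LabelEncoder per column, matching the loops above; labels stay as 0/1 here,
without the +1 shift, so the prediction prints [1] rather than [2]):

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB

df = pd.read_csv("Naive_Bayesian.csv", index_col="Item no")
enc = df.apply(LabelEncoder().fit_transform)   # encode every column
X, y = enc.iloc[:, :-1], enc.iloc[:, -1]
model = GaussianNB().fit(X, y)
print(model.predict([[0, 1, 1, 0]]))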


===============================================================================
PRACTICAL 5: LINEAR REGRESSION
(Predicting continuous values with evaluation using MAE, MSE, and RMSE)
===============================================================================

CODE:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

dataset = pd.read_csv("LinearRegression.csv")
print(dataset)

x = dataset.iloc[:, 0:1]
y = dataset.iloc[:, 1]
y = y.replace(['Yes', 'No'], [1, 0])   # encode the binary target as 1/0
print(y)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.01, random_state=123)

model = LinearRegression()
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# predict for an unseen temperature of 18 degrees and threshold at 0.5
y_pred_val = model.predict([[18]])
print(y_pred_val)
if y_pred_val > 0.5:
    print("Yes")
else:
    print("No")

plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, model.predict(X_train))
plt.show()

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

OUTPUT:
Outside Temperature \nCelcius Wear a\n jacket
0 30 No
1 25 No
2 20 No
3 15 Yes
4 10 Yes

0 0
1 0
2 0
3 1
4 1
Name: Wear a\n jacket, dtype: int64

[0.54285714]
Yes

Mean Absolute Error: 0.14285714285714302
Mean Squared Error: 0.02040816326530617
Root Mean Squared Error: 0.14285714285714302
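
NOTE: The target here is binary (wear a jacket: yes/no), so thresholding a
linear regression at 0.5 is only a demonstration. A minimal sketch of the
more natural model, logistic regression, reusing the split from above:

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf.fit(X_train, y_train)
print(clf.predict([[18]]))         # class label directly (0 or 1)
print(clf.predict_proba([[18]]))   # class probabilities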


===============================================================================
PRACTICAL 6: K-NEAREST NEIGHBORS (KNN) CLASSIFIER
(Classification using different k values with accuracy evaluation)
===============================================================================

CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("knn.csv")
df = df[df['Item no.'].notna()]   # drop trailing empty rows from the csv
print("Dataset Preview:")
print(df.head())

X = df.iloc[:, 1:4]
y = df.iloc[:, 4]
print("INPUT\n", X)
print("OUTPUT\n", y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale features so distances are not dominated by any single column
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("XTRAIN\n", X_train)
print("X_TEST\n", X_test)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print("PREDICTION : \n", y_pred)

print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

OUTPUT:
Dataset Preview:
Item no. Temp Humidity Wind Speed Play ...
0 1.0 85.0 85.0 12.0 No ...
1 2.0 80.0 90.0 9.0 No ...
2 3.0 83.0 86.0 4.0 Yes ...
3 4.0 70.0 96.0 3.0 Yes ...
4 5.0 68.0 80.0 5.0 Yes ...

INPUT
Temp Humidity Wind Speed
0 85.0 85.0 12.0
1 80.0 90.0 9.0
2 83.0 86.0 4.0
3 70.0 96.0 3.0
4 68.0 80.0 5.0
...
[14 rows x 3 columns]

OUTPUT
0 No
1 No
2 Yes
3 Yes
4 Yes
...
Name: Play, dtype: object

XTRAIN
[[ 1.37690922 -0.53048047 -0.46006855]
[-1.22885447 -0.99359834 2.25104967]
[-0.57741354 -0.99359834 -0.46006855]
[ 1.70262968 0.48837885 -0.64080976]
...
[-1.3917147 -1.45671621 -1.00229219]]

X_TEST
[[ 0.39974784 -0.0673626 -1.00229219]
[-0.08883285 0.85887314 -0.64080976]
[ 2.02835014 0.39575527 0.80511996]]

PREDICTION :
['Yes' 'Yes' 'Yes']

Accuracy: 0.6666666666666666

Classification Report:
precision recall f1-score support

No 0.00 0.00 0.00 1
Yes 0.67 1.00 0.80 2

accuracy 0.67 3
macro avg 0.33 0.50 0.40 3
weighted avg 0.44 0.67 0.53 3
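
NOTE: The run above fixes k=3. Since the practical is about trying different
k values, a minimal sketch that sweeps k and reports test accuracy, reusing
the scaled splits from above (with only 3 test rows the numbers are coarse):

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

for k in range(1, 8):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    acc = accuracy_score(y_test, knn.predict(X_test))
    print(f"k={k}: accuracy={acc:.2f}")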


===============================================================================
PRACTICAL 7: MULTIPLE LINEAR REGRESSION
(Prediction using multiple features with R² score and RMSE evaluation)
===============================================================================

CODE:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

Data = pd.read_excel("student_data1.xlsx")
print(Data)

X = Data.iloc[:, :2]
y = Data.iloc[:, -1:]
print(X)
print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Xtrain\n", X_train)
print("YTEST\n", y_test)

model = LinearRegression()
model.fit(X_train.to_numpy(), y_train)

# predict for a new student with CGPA 8.6 and IQ 125
y_pred = model.predict([[8.6, 125]])
print("model prediction for CGPA=8.6, IQ=125:\n", y_pred.round(2))

print("M= ", model.coef_.round(2))
print("b= ", model.intercept_.round(2))

# evaluate on the held-out rows (R2 and RMSE, as per the aim)
y_test_pred = model.predict(X_test.to_numpy())
rmse = np.sqrt(np.mean((y_test.to_numpy() - y_test_pred) ** 2))
print("R2 score:", round(r2_score(y_test, y_test_pred), 3))
print("RMSE:", round(float(rmse), 3))

OUTPUT:
CGPA IQ Placement (LPA)
0 7.5 110 6.5
1 8.0 120 7.0
2 8.5 125 8.2
3 9.0 130 9.1
4 6.5 100 5.0
...
[10 rows x 3 columns]

CGPA IQ
0 7.5 110
1 8.0 120
2 8.5 125
3 9.0 130
4 6.5 100
...
[10 rows x 2 columns]

Placement (LPA)
0 6.5
1 7.0
2 8.2
3 9.1
4 5.0
...
[10 rows x 1 columns]

Xtrain
CGPA IQ
5 7.0 105
0 7.5 110
7 8.8 128
2 8.5 125
...
[8 rows x 2 columns]

YTEST
Placement (LPA)
8 5.2
1 7.0

model prediction for CGPA=8.6, IQ=125:
[[8.45]]

M= [[1.32 0.03]]
b= [-6.51]


===============================================================================
PRACTICAL 8: SINGULAR VALUE DECOMPOSITION (SVD)
(Dimensionality Reduction using SVD - Manual & Sklearn Implementation)
===============================================================================

------- 8.1: SVD MANUAL IMPLEMENTATION (svd.py) -------

CODE:
import pandas as pd
import numpy as np

df = pd.read_excel("student_dataset.xlsx")
A = df.iloc[:, :3].to_numpy()
A_mean = A - np.mean(A, axis=0)   # centre the data, as PCA does

# S holds the singular values in descending order
U, S, V_T = np.linalg.svd(A_mean)
k = 2
U_k = U[:, :k]
S_k = np.diag(S[:k])

# scores of the top-k components: U_k @ S_k (equivalently A_mean @ V_T[:k].T)
final_data1 = np.dot(U_k, S_k)
print("Reduced Data:\n", final_data1)

# squared singular values are proportional to the variance explained
explained_variance = (S[:k] ** 2) / np.sum(S ** 2)
print("Explained variance by top 2 components:", explained_variance)

reduced_df = pd.DataFrame(final_data1, columns=["PC1", "PC2"])
reduced_df['GTU'] = df['GTU'].values
print(reduced_df)

OUTPUT:
Reduced Data:
[[ -2.60622042 0.08983428]
[ 15.20533711 -2.22651162]
[-16.15266994 0.28962606]
[ 22.72992624 -0.77843972]
[ 3.46193108 -0.87192496]
...
[-12.83777493 0.27648847]]

Explained variance by top 2 components: [0.99132896 0.00672569]

PC1 PC2 GTU
0 -2.606220 0.089834 70
1 15.205337 -2.226512 88
2 -16.152670 0.289626 65
3 22.729926 -0.778440 92
4 3.461931 -0.871925 78
...
[15 rows x 3 columns]
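
NOTE: A quick sanity check of the identity behind the projection: since
A_mean = U S V^T, projecting onto the top-k right singular vectors gives
A_mean @ V_T[:k].T == U_k @ S_k. A one-line sketch, reusing the variables
from the code above:

import numpy as np
print(np.allclose(A_mean @ V_T[:k].T, final_data1))   # expected: True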

------- 8.2: SVD USING SKLEARN (svd2.py) -------

CODE:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

df = pd.read_excel("student_dataset.xlsx")
X = df.iloc[:, :3]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_scaled)
print(X_reduced)

Dataf = pd.DataFrame(X_reduced, columns=['PC1', 'PC2'])
Dataf['GTU Marks'] = df['GTU'].values
print(Dataf)

print("Singular values:", svd.singular_values_)
print("Explained variance:", svd.explained_variance_)
print("Explained variance ratio:", svd.explained_variance_ratio_)
print("Total variance captured:", svd.explained_variance_ratio_.sum())

OUTPUT:
[[-0.29412273 -0.01659157]
[ 1.77010531 -0.27352604]
[-2.00150714 0.06824795]
[ 2.74518022 -0.074903 ]
[ 0.42847827 -0.1586513 ]
...
[-1.5266755 -0.01597918]]

PC1 PC2 GTU Marks
0 -0.294123 -0.016592 70
1 1.770105 -0.273526 88
2 -2.001507 0.068248 65
3 2.745180 -0.074903 92
4 0.428478 -0.158651 78
...
[15 rows x 3 columns]

Singular values: [6.67619539 0.55259316]
Explained variance: [2.97143899 0.02035728]
Explained variance ratio: [0.99047966 0.00678576]
Total variance captured: 0.997265423131314
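
NOTE: TruncatedSVD can also map the reduced data back to the original feature
space, which shows how much information two components keep. A minimal
sketch, reusing svd, X_scaled and X_reduced from the run above:

X_back = svd.inverse_transform(X_reduced)        # back to 3 scaled features
err = ((X_scaled - X_back) ** 2).mean()
print("Mean squared reconstruction error:", err) # small; ~0.3% of variance lost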


===============================================================================
END OF JOURNAL
===============================================================================