===============================================================================
MACHINE LEARNING PRACTICALS - JOURNAL
===============================================================================
===============================================================================
PRACTICAL 1: FEATURE ENGINEERING AND DATA PREPROCESSING
(Handling missing values, Encoding categorical variables, Scaling features)
===============================================================================
------- 1.1: ENCODING (encoding.py) -------
CODE:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
df=pd.read_csv("customer.csv")
print(df)
df1=df.iloc[:,2:]
print(df1)
x_train,x_test,y_train,y_test=train_test_split(df1.iloc[:,0:2],df1.iloc[:,-1],test_size=0.1)
print("XTrain: \n",x_train)
print("Ytrain: \n",y_train)
print("XTEST: \n",x_test)
print("YTEST: \n",y_test)
#ordinal Encoding
oe=OrdinalEncoder(categories=[['Poor','Average','Good'],['HSC','UG','PG']])
oe.fit(x_train)
x_train=oe.transform(x_train)
x_test=oe.transform(x_test)
print(x_train)
#Label Encoder
le=LabelEncoder()
le.fit(y_train)
y_train=le.transform(y_train)
y_test=le.transform(y_test)
print(y_train)
#One-hot encoding the Gender column using sklearn's OneHotEncoder
df2=df.iloc[:,1:2]
encod=OneHotEncoder(sparse_output=False)
encoded=encod.fit_transform(df2)
print("Feature Names:")
print(encod.get_feature_names_out())
print(encoded)
OUTPUT:
age Gender review education Purchase
0 NaN Male Good HSC yes
1 48.0 Male Good PG no
2 68.0 Female Average UG no
3 77.0 Female Average PG yes
4 26.0 Male Poor PG yes
...
[14 rows x 5 columns]
review education Purchase
0 Good HSC yes
1 Good PG no
2 Average UG no
3 Average PG yes
...
[14 rows x 3 columns]
XTrain:
review education
3 Average PG
8 Good UG
6 Good PG
2 Average UG
...
[12 rows x 2 columns]
Ytrain:
3 yes
8 yes
6 yes
2 no
...
Name: Purchase, dtype: object
XTEST:
review education
5 Good UG
13 Good UG
YTEST:
5 no
13 yes
Name: Purchase, dtype: object
[[1. 2.]
[2. 1.]
[2. 2.]
[1. 1.]
...
[0. 2.]]
[1 1 1 0 0 0 0 1 1 1 1 1]
Feature Names:
['Gender_Female' 'Gender_Male']
[[0. 1.]
[0. 1.]
[1. 0.]
[1. 0.]
...
[1. 0.]]
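For comparison, the same one-hot encoding can also be done directly in pandas with get_dummies; a minimal sketch, assuming the same customer.csv as above:
import pandas as pd
df = pd.read_csv("customer.csv")
# get_dummies creates one indicator column per Gender category
gender_dummies = pd.get_dummies(df['Gender'], prefix='Gender', dtype=float)
print(gender_dummies.head())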
------- 1.2: BOXPLOT AND HISTOGRAM (boxplot.py) -------
CODE:
import matplotlib.pyplot as plt
import numpy as np
arr=np.array([100,120,110,150,110,140,130,170,120,220,140,110])
arr1=np.sort(arr)
print(arr1)
mean=np.mean(arr)
print("MEAN=",mean)
median=np.median(arr)
print("MEDIAN=",median)
q1=np.percentile(arr,25)
print("Quarter 1=",q1)
q3=np.percentile(arr1,75)
print("Quarter 3=",q3)
plt.boxplot(arr)
plt.show()
plt.hist(arr)
plt.show()
OUTPUT:
[100 110 110 110 120 120 130 140 140 150 170 220]
MEAN= 135.0
MEDIAN= 125.0
Quarter 1= 110.0
Quarter 3= 142.5
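The quartiles above also give the IQR outlier rule that the boxplot whiskers are based on; a minimal sketch on the same array:
import numpy as np
arr = np.array([100,120,110,150,110,140,130,170,120,220,140,110])
q1, q3 = np.percentile(arr, [25, 75])
iqr = q3 - q1                                 # interquartile range = 32.5
lower, upper = q1 - 1.5*iqr, q3 + 1.5*iqr     # fences = 61.25 and 191.25
print("Outliers:", arr[(arr < lower) | (arr > upper)])   # 220 lies above the upper fence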
------- 1.3: CORRELATION WITH TARGET (corela_target.py) -------
CODE:
import pandas as pd
data = {
'sqft': [1500, 1600, 1700, 1800, 1900],
'rooms': [3, 3, 4, 4, 5],
'roof_color': [1, 2, 1, 2, 1],
'price': [300000, 320000, 340000, 360000, 380000]
}
df = pd.DataFrame(data)
correlation_matrix = df.corr(numeric_only=True)
print("🔁 Full Correlation Matrix:")
print(correlation_matrix.round(2))
correlation = df.corr()['price'].drop('price')
print(correlation)
selected_features = correlation[correlation.abs() > 0.3].index
print("Selected features:", list(selected_features))
OUTPUT:
🔁 Full Correlation Matrix:
sqft rooms roof_color price
sqft 1.00 0.94 0.00 1.00
rooms 0.94 1.00 -0.33 0.94
roof_color 0.00 -0.33 1.00 0.00
price 1.00 0.94 0.00 1.00
sqft 1.000000e+00
rooms 9.449112e-01
roof_color 5.250970e-17
Name: price, dtype: float64
Selected features: ['sqft', 'rooms']
------- 1.4: COLUMN TRANSFORMER ENCODING (column_trans_encod.py) -------
CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
df = pd.read_csv("customer.csv")
print(df)
x=df.iloc[:,:4]
y=df.iloc[:,-1]
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.1)
trans = ColumnTransformer(
    transformers=[
        ('impute_age', SimpleImputer(), ['age']),
        ('onehot_gender', OneHotEncoder(sparse_output=False), ['Gender']),
        ('ordinal_rating', OrdinalEncoder(categories=[['Poor', 'Average', 'Good']]), ['review']),
        ('ordinal_education', OrdinalEncoder(categories=[['HSC', 'UG', 'PG']]), ['education'])
    ],
    remainder='passthrough'
)
x_train = trans.fit_transform(x_train)
x_test = trans.transform(x_test)   # transform only: reuse the categories/statistics learned from the training split
print("\nTransformed XTrain:\n", x_train)
print("\nTransformed XTest:\n", x_test)
le=LabelEncoder()
y_train1=le.fit_transform(y_train)
y_test1=le.transform(y_test)   # reuse the mapping learned from y_train
print("\nTransformed YTrain:\n", y_train1)
print("\nTransformed YTest:\n", y_test1)
OUTPUT:
age Gender review education Purchase
0 NaN Male Good HSC yes
1 48.0 Male Good PG no
2 68.0 Female Average UG no
3 77.0 Female Average PG yes
4 26.0 Male Poor PG yes
...
[14 rows x 5 columns]
Transformed XTrain:
[[55. 0. 1. 0. 2.]
[18. 0. 1. 2. 1.]
[44. 0. 1. 2. 1.]
[50. 1. 0. 2. 1.]
...
[26. 0. 1. 0. 2.]]
Transformed XTest:
[[77. 1. 2. 0.]
[77. 1. 1. 2.]]
Transformed YTrain:
[1 0 1 1 1 0 1 0 0 0 1 1]
Transformed YTest:
[0 0]
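To keep all preprocessing fitted on the training split only, the ColumnTransformer can be wrapped in a Pipeline; a minimal sketch, where the DecisionTreeClassifier is an assumed stand-in for any downstream model:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
df = pd.read_csv("customer.csv")
x = df.iloc[:, :4]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
trans = ColumnTransformer([
    ('impute_age', SimpleImputer(), ['age']),
    ('onehot_gender', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['Gender']),
    ('ordinal_rating', OrdinalEncoder(categories=[['Poor', 'Average', 'Good']]), ['review']),
    ('ordinal_education', OrdinalEncoder(categories=[['HSC', 'UG', 'PG']]), ['education'])
])
# fit() learns imputation statistics and categories on the training split only;
# the same fitted steps are reused when scoring the test split
pipe = Pipeline([('preprocess', trans), ('clf', DecisionTreeClassifier())])
pipe.fit(x_train, y_train)
print("Test accuracy:", pipe.score(x_test, y_test))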
------- 1.5: CORRELATION BETWEEN FEATURES (corel_bt_feat.py) -------
CODE:
import pandas as pd
data = {
'sqft': [1500, 1600, 1700, 1800, 1900],
'rooms': [3, 3, 4, 4, 5],
'bathrooms': [1, 2, 2, 2, 3],
'roof_color': [1, 2, 1, 2, 1],
'price': [300000, 320000, 340000, 360000, 380000]
}
df = pd.DataFrame(data)
feature_corr = df.drop(columns='price').corr()
print("Correlation between features:")
print(feature_corr.round(2))
OUTPUT:
Correlation between features:
sqft rooms bathrooms roof_color
sqft 1.00 0.94 0.89 0.00
rooms 0.94 1.00 0.85 -0.33
bathrooms 0.89 0.85 1.00 0.00
roof_color 0.00 -0.33 0.00 1.00
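Since sqft and rooms are correlated at 0.94, one feature from each highly correlated pair can be dropped to reduce redundancy; a minimal sketch with an assumed 0.9 threshold:
import pandas as pd
import numpy as np
data = {
'sqft': [1500, 1600, 1700, 1800, 1900],
'rooms': [3, 3, 4, 4, 5],
'bathrooms': [1, 2, 2, 2, 3],
'roof_color': [1, 2, 1, 2, 1],
}
df = pd.DataFrame(data)
corr = df.corr().abs()
# keep only the upper triangle so each pair is inspected once
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
print("Columns to drop:", to_drop)   # ['rooms'], since corr(sqft, rooms) = 0.94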
===============================================================================
PRACTICAL 2: PRINCIPAL COMPONENT ANALYSIS (PCA)
(Dimensionality Reduction while retaining maximum variance)
===============================================================================
CODE:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
df=pd.read_csv("student_dataset.csv")
print(df)
scaler=StandardScaler()
df1=scaler.fit_transform(df.iloc[:,:3])
print(df1)
cov_matrix = np.cov(df1.T)
print("COVARIANCE MATRIX:\n", cov_matrix)
eig_val,eig_vect=np.linalg.eig(cov_matrix)
print("\nEigen Values\n",eig_val)
print("Eigen Vectors\n",eig_vect)
# eigenvalues are not returned in sorted order; for this data the two largest
# are at indices 0 and 2, so those eigenvectors are taken as the top 2 components
pc = eig_vect[:,[0, 2]]
pc=pc.T
print("\nTop 2 Principal Components:\n", pc)
trans_df = np.dot(df1[:,0:3], pc.T)
print(" \nNew Transform\n",trans_df)
Dataf=pd.DataFrame(trans_df,columns=['PC1','PC2'])
Dataf['GTU Marks']=df['GTU'].values
print(Dataf)
OUTPUT:
Mid_Sem IQ HSC GTU
0 35 110 78 70
1 42 125 85 88
2 28 100 72 65
3 45 130 90 92
4 38 115 80 78
...
[15 rows x 4 columns]
[[-0.09736702 -0.20785572 -0.20441405]
[ 1.03858157 1.20934235 0.81765621]
[-1.23331562 -1.15265443 -1.08047428]
[ 1.52541669 1.68174171 1.5477064 ]
...
[-0.74648051 -0.96369469 -0.93446424]]
COVARIANCE MATRIX:
[[1.07142857 1.0614152 1.05676449]
[1.0614152 1.07142857 1.05019437]
[1.05676449 1.05019437 1.07142857]]
Eigen Values
[3.18368463 0.00878971 0.02181137]
Eigen Vectors
[[-0.57842869 -0.7974863 -0.17156877]
[-0.57723546 0.54876897 -0.60469152]
[-0.57638483 0.25073535 0.77776109]]
Top 2 Principal Components:
[[-0.57842869 -0.57723546 -0.57638483]
[-0.17156877 -0.60469152 0.77776109]]
New Transform
[[ 0.29412273 -0.01659157]
[-1.77010531 -0.27352604]
[ 2.00150714 0.06824795]
[-2.74518022 -0.074903 ]
...
[ 1.5266755 -0.01597918]]
PC1 PC2 GTU Marks
0 0.294123 -0.016592 70
1 -1.770105 -0.273526 88
2 2.001507 0.068248 65
3 -2.745180 -0.074903 92
4 -0.428478 -0.158651 78
...
[15 rows x 3 columns]
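The manual eigen-decomposition can be cross-checked with sklearn's PCA on the same standardized data; component signs may be flipped, which is expected:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
df = pd.read_csv("student_dataset.csv")
X_scaled = StandardScaler().fit_transform(df.iloc[:, :3])
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
print(X_pca[:5])                                    # matches the manual projection up to sign
print("Explained variance ratio:", pca.explained_variance_ratio_)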
===============================================================================
PRACTICAL 3: DECISION TREE CLASSIFIER
(Classification with evaluation using precision, recall, and F1-score)
===============================================================================
CODE:
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
data = pd.read_csv("decesiontree.csv")
print(data)
cleanup_nums = {"Age": {"Youth": 0, "Middle": 1, "Senior" : 2},
"Income": {"Low": 0, "Medium": 1, "High" : 2 },
"Student": {"No": 0, "Yes":1 },
"Credit Rating": { "Fair": 1, "Excellent" : 2 },
"Buys-Computer": {"No": 0, "Yes": 1}}
data.replace(cleanup_nums, inplace = True)
print(data)
predictors = data.iloc[:, 1:5]
target = data.iloc[:, 5]
dtree_entropy=DecisionTreeClassifier(criterion="entropy", random_state=100,
                                     max_depth=3, min_samples_leaf=5)
OUTPUT:
Item no Age Income Student Credit Rating Buys-Computer
0 1 Youth High No Fair No
1 2 Youth High No Excellent No
2 3 Middle High No Fair Yes
3 4 Senior Medium No Fair Yes
4 5 Senior Low Yes Fair Yes
...
[14 rows x 6 columns]
Item no Age Income Student Credit Rating Buys-Computer
0 1 0 2 0 1 0
1 2 0 2 0 2 0
2 3 1 2 0 1 1
3 4 2 1 0 1 1
4 5 2 0 1 1 1
...
[14 rows x 6 columns]
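The script above defines the classifier but stops before training or evaluating it; a minimal completion sketch, reusing predictors, target and dtree_entropy from the code above and evaluating on the same 14 rows (the original defines no separate test split):
dtree_entropy.fit(predictors, target)
pred = dtree_entropy.predict(predictors)
print("Accuracy:", accuracy_score(target, pred))
print("Confusion matrix:\n", confusion_matrix(target, pred))
print(classification_report(target, pred))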
===============================================================================
PRACTICAL 4: NAIVE BAYES CLASSIFIER
(Probabilistic classification using Gaussian Naive Bayes)
===============================================================================
CODE:
import pandas as pd
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
fl = "Naive_Bayesian.csv"
df = pd.read_csv(fl, index_col = "Item no")
print (df)
dfCol = df.columns
print ("df columns: ", dfCol)
ndfCol = df.shape[1]
ndfRow = df.shape[0]
# one list per column, filled below with that column's raw values
feature = [[] for x in range(ndfCol)]
for i in range(ndfCol):
    feature[i] = list(df[dfCol[i]])
    print(dfCol[i], ":", feature[i])
le = preprocessing.LabelEncoder()
# label-encode each column independently
feature0 = [[] for x in range(ndfCol)]
for i in range(ndfCol):
    feature0[i] = le.fit_transform(feature[i])
    print(dfCol[i], "encoded:", feature0[i])
features = []
for i in range(ndfRow):
    xlst = []
    for j in range(ndfCol-1):
        xlst.append(feature0[j][i])
    xtup = tuple(xlst)
    features.append(xtup)
print ("features:", features)
# the last encoded column (Buys-Computer) is the class; shift by 1 so the classes are 1 and 2
label = feature0[ndfCol-1]
label = [label[i]+1 for i in range(ndfRow)]
print ("label:", label)
model = GaussianNB()
model.fit(features, label)
print ("model:", model)
ptStr = input ("Enter unknown data (separated by ,) excluding Index Column: ")
ptLst = [int(x) for x in ptStr.split(',')]
point1 = [ptLst]
print ("Unknown data (sample):", point1)
predicted= model.predict(point1)
print ("Class for Point:", point1, "is:", predicted)
OUTPUT (with input: 0,1,1,0):
Age Income Student Credit Rating Buys-Computer
Item no
1 Youth High No Fair No
2 Youth High No Excellent No
3 Middle High No Fair Yes
4 Senior Medium No Fair Yes
...
[14 rows x 5 columns]
df columns: Index(['Age', 'Income', 'Student', 'Credit Rating', 'Buys-Computer'], dtype='object')
Age : ['Youth', 'Youth', 'Middle', 'Senior', 'Senior', ...]
Income : ['High', 'High', 'High', 'Medium', 'Low', ...]
Student : ['No', 'No', 'No', 'No', 'Yes', ...]
Credit Rating : ['Fair', 'Excellent', 'Fair', 'Fair', 'Fair', ...]
Buys-Computer : ['No', 'No', 'Yes', 'Yes', 'Yes', ...]
Age encoded: [2 2 0 1 1 0 1 2 2 1 2 0 0 1]
Income encoded: [0 0 0 2 1 1 1 2 1 2 2 2 0 2]
Student encoded: [0 0 0 0 1 1 1 0 1 1 1 0 1 1]
Credit Rating encoded: [1 0 1 1 1 0 0 1 1 1 0 0 1 0]
Buys-Computer encoded: [0 0 1 1 1 0 1 0 1 1 1 1 1 0]
features: [(2, 0, 0, 1), (2, 0, 0, 0), (0, 0, 0, 1), (1, 2, 0, 1),
(1, 1, 1, 1), (0, 1, 1, 0), ...]
label: [1, 1, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 1]
model: GaussianNB()
Enter unknown data (separated by ,) excluding Index Column:
Unknown data (sample): [[0, 1, 1, 0]]
Class for Point: [[0, 1, 1, 0]] is: [2]
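The per-column loops above can be written more compactly with pandas; a minimal sketch of the same encode-and-fit flow (GaussianNB is kept to match the practical, although CategoricalNB would also suit this all-categorical data):
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
df = pd.read_csv("Naive_Bayesian.csv", index_col="Item no")
# label-encode every column in one pass (a fresh encoder fit per column)
encoded = df.apply(lambda col: LabelEncoder().fit_transform(col))
X = encoded.iloc[:, :-1].to_numpy()   # Age, Income, Student, Credit Rating
y = encoded.iloc[:, -1].to_numpy()    # Buys-Computer (here 0 = No, 1 = Yes)
model = GaussianNB().fit(X, y)
print("Class for [0, 1, 1, 0]:", model.predict([[0, 1, 1, 0]]))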
===============================================================================
PRACTICAL 5: LINEAR REGRESSION
(Predicting continuous values with evaluation using MSE and R² score)
===============================================================================
CODE:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import metrics
dataset=pd.read_csv("LinearRegression.csv")
print(dataset)
x=dataset.iloc[:,0:1]
y=dataset.iloc[:,1]
y=y.replace(['Yes','No'],[1,0])
print(y)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.01, random_state=123)
model = LinearRegression()
model = model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_val=model.predict([[18]])   # predict for an outside temperature of 18 degrees Celsius
print(y_pred_val)
if(y_pred_val > 0.5):
    print("Yes")
else:
    print("No")
plt.scatter(X_train,y_train, color = 'red')
plt.plot(X_train, model.predict(X_train))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
OUTPUT:
Outside Temperature \nCelcius Wear a\n jacket
0 30 No
1 25 No
2 20 No
3 15 Yes
4 10 Yes
0 0
1 0
2 0
3 1
4 1
Name: Wear a\n jacket, dtype: int64
[0.54285714]
Yes
Mean Absolute Error: 0.14285714285714302
Mean Squared Error: 0.02040816326530617
Root Mean Squared Error: 0.14285714285714302
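The title also mentions the R² score, which the script does not print; a minimal sketch of adding it, reusing y_test and y_pred from above (with test_size=0.01 the test set holds a single row, so R² is not well defined there; a larger test_size such as 0.2 would give a meaningful value):
from sklearn.metrics import r2_score
print('R2 Score:', r2_score(y_test, y_pred))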
===============================================================================
PRACTICAL 6: K-NEAREST NEIGHBORS (KNN) CLASSIFIER
(Classification using different k values with accuracy evaluation)
===============================================================================
CODE:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
df = pd.read_csv("knn.csv")
df = df[df['Item no.'].notna()]
print("Dataset Preview:")
print(df.head())
X = df.iloc[:, 1:4]
y = df.iloc[:, 4]
print("INPUT\n",X)
print("OUTPUT\n",y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print("XTRAIN\n",X_train)
print("X_TEST\n",X_test)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print("PREDICTION : \n",y_pred)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
OUTPUT:
Dataset Preview:
Item no. Temp Humidity Wind Speed Play ...
0 1.0 85.0 85.0 12.0 No ...
1 2.0 80.0 90.0 9.0 No ...
2 3.0 83.0 86.0 4.0 Yes ...
3 4.0 70.0 96.0 3.0 Yes ...
4 5.0 68.0 80.0 5.0 Yes ...
INPUT
Temp Humidity Wind Speed
0 85.0 85.0 12.0
1 80.0 90.0 9.0
2 83.0 86.0 4.0
3 70.0 96.0 3.0
4 68.0 80.0 5.0
...
[14 rows x 3 columns]
OUTPUT
0 No
1 No
2 Yes
3 Yes
4 Yes
...
Name: Play, dtype: object
XTRAIN
[[ 1.37690922 -0.53048047 -0.46006855]
[-1.22885447 -0.99359834 2.25104967]
[-0.57741354 -0.99359834 -0.46006855]
[ 1.70262968 0.48837885 -0.64080976]
...
[-1.3917147 -1.45671621 -1.00229219]]
X_TEST
[[ 0.39974784 -0.0673626 -1.00229219]
[-0.08883285 0.85887314 -0.64080976]
[ 2.02835014 0.39575527 0.80511996]]
PREDICTION :
['Yes' 'Yes' 'Yes']
Accuracy: 0.6666666666666666
Classification Report:
precision recall f1-score support
No 0.00 0.00 0.00 1
Yes 0.67 1.00 0.80 2
accuracy 0.67 3
macro avg 0.33 0.50 0.40 3
weighted avg 0.44 0.67 0.53 3
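To compare different k values as the title suggests, the same scaled split can be reused in a loop; a minimal sketch using X_train, X_test, y_train, y_test from above:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
# odd k values avoid ties in the two-class vote
for k in (1, 3, 5, 7):
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    print("k =", k, "accuracy =", accuracy_score(y_test, clf.predict(X_test)))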
===============================================================================
PRACTICAL 7: MULTIPLE LINEAR REGRESSION
(Prediction using multiple features with R² score and RMSE evaluation)
===============================================================================
CODE:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
Data=pd.read_excel("student_data1.xlsx")
print(Data)
X=Data.iloc[:,:2]
y=Data.iloc[:,-1:]
print(X)
print(y)
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2,random_state=42)
print("Xtrain\n",X_train)
print("XTEST\n",y_test)
model = LinearRegression()
model.fit(X_train.to_numpy(), y_train)
y_pred = model.predict([[8.6,125]])   # predict placement for a new student with CGPA=8.6, IQ=125
print("model prediction for CGPA=8.6, IQ=125:\n",y_pred.round(2))
print("M= ",model.coef_.round(2))
print("b= ",model.intercept_.round(2))
OUTPUT:
CGPA IQ Placement (LPA)
0 7.5 110 6.5
1 8.0 120 7.0
2 8.5 125 8.2
3 9.0 130 9.1
4 6.5 100 5.0
...
[10 rows x 3 columns]
CGPA IQ
0 7.5 110
1 8.0 120
2 8.5 125
3 9.0 130
4 6.5 100
...
[10 rows x 2 columns]
Placement (LPA)
0 6.5
1 7.0
2 8.2
3 9.1
4 5.0
...
[10 rows x 1 columns]
Xtrain
CGPA IQ
5 7.0 105
0 7.5 110
7 8.8 128
2 8.5 125
...
[8 rows x 2 columns]
YTEST
Placement (LPA)
8 5.2
1 7.0
model prediction for CGPA=8.6, IQ=125:
[[8.45]]
M= [[1.32 0.03]]
b= [-6.51]
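To report the R² score and RMSE named in the title, the fitted model can also be evaluated on the held-out rows; a minimal sketch reusing model, X_test, y_test and np from above:
from sklearn.metrics import r2_score, mean_squared_error
y_test_pred = model.predict(X_test.to_numpy())   # predictions for the two held-out students
print("R2 Score:", r2_score(y_test, y_test_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_test_pred)))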
===============================================================================
PRACTICAL 8: SINGULAR VALUE DECOMPOSITION (SVD)
(Dimensionality Reduction using SVD - Manual & Sklearn Implementation)
===============================================================================
------- 8.1: SVD MANUAL IMPLEMENTATION (svd.py) -------
CODE:
import pandas as pd
import numpy as np
df = pd.read_excel("student_dataset.xlsx")
A = df.iloc[:, :3].to_numpy()
A_mean = A - np.mean(A, axis=0)
U, S, V_T = np.linalg.svd(A_mean)   # S holds the singular values in descending order
k = 2
U_k = U[:, :k]
S_k = np.diag(S[:k])
final_data1 = np.dot(U_k, S_k)      # coordinates of each row in the reduced 2-D space
print("Reduced Data:\n", final_data1)
explained_variance = (S[:k]**2) / np.sum(S**2)
print("Explained variance by top 2 components:", explained_variance)
reduced_df = pd.DataFrame(final_data1, columns=["PC1", "PC2"])
reduced_df['GTU'] = df['GTU'].values
print(reduced_df)
OUTPUT:
Reduced Data:
[[ -2.60622042 0.08983428]
[ 15.20533711 -2.22651162]
[-16.15266994 0.28962606]
[ 22.72992624 -0.77843972]
[ 3.46193108 -0.87192496]
...
[-12.83777493 0.27648847]]
Explained variance by top 2 components: [0.99132896 0.00672569]
PC1 PC2 GTU
0 -2.606220 0.089834 70
1 15.205337 -2.226512 88
2 -16.152670 0.289626 65
3 22.729926 -0.778440 92
4 3.461931 -0.871925 78
...
[15 rows x 3 columns]
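The reduced data U_k·S_k is the same as projecting the centred data onto the first k right singular vectors; a quick consistency check reusing A_mean, V_T, U_k, S_k and k from above:
proj = A_mean @ V_T[:k].T                     # project onto the top-k right singular vectors
print(np.allclose(proj, np.dot(U_k, S_k)))    # expected: True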
------- 8.2: SVD USING SKLEARN (svd2.py) -------
CODE:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
df = pd.read_excel("student_dataset.xlsx")
X = df.iloc[:, :3]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_scaled)
print(X_reduced)
Dataf=pd.DataFrame(X_reduced,columns=['PC1','PC2'])
Dataf['GTU Marks']=df['GTU'].values
print(Dataf)
print("Singular values:", svd.singular_values_)
print("Explained variance:", svd.explained_variance_)
print("Explained variance ratio:", svd.explained_variance_ratio_)
print("Total variance captured:", svd.explained_variance_ratio_.sum())
OUTPUT:
[[-0.29412273 -0.01659157]
[ 1.77010531 -0.27352604]
[-2.00150714 0.06824795]
[ 2.74518022 -0.074903 ]
[ 0.42847827 -0.1586513 ]
...
[-1.5266755 -0.01597918]]
PC1 PC2 GTU Marks
0 -0.294123 -0.016592 70
1 1.770105 -0.273526 88
2 -2.001507 0.068248 65
3 2.745180 -0.074903 92
4 0.428478 -0.158651 78
...
[15 rows x 3 columns]
Singular values: [6.67619539 0.55259316]
Explained variance: [2.97143899 0.02035728]
Explained variance ratio: [0.99047966 0.00678576]
Total variance captured: 0.997265423131314
===============================================================================
END OF JOURNAL
===============================================================================