MACHINE LEARNING LAB

1. Create a simple dataset and then separate features and labels, based on a case study of predicting whether a student will pass or fail an exam from their study and sleep hours.

Case Study: Student Exam Performance

Features:
1. Hours Studied
2. Hours Slept

Label: Pass (1) or Fail (0)

import pandas as pd
import matplotlib.pyplot as plt

# Creating the dataset
data = {
    'Hours_Studied': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Hours_Slept': [5, 6, 5, 4, 5, 6, 7, 8, 6, 5],
    'Pass': [0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Display the dataset
print("Dataset:")
print(df)

# Separate features and labels
X = df[['Hours_Studied', 'Hours_Slept']]
y = df['Pass']

# Display features and labels
print("\nFeatures (X):")
print(X)
print("\nLabels (y):")
print(y)

# Plotting the data (red = fail, green = pass)
colors = {0: 'red', 1: 'green'}
plt.figure(figsize=(10, 6))
plt.scatter(df['Hours_Studied'], df['Hours_Slept'], c=df['Pass'].map(colors))
plt.xlabel('Hours Studied')
plt.ylabel('Hours Slept')
plt.title('Student Exam Performance')
plt.grid(True)
plt.show()
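With X and y separated, a classifier can be trained on them. Below is a minimal, optional sketch (not part of the original exercise) using scikit-learn's LogisticRegression; the new student's 4.5 study hours and 6 sleep hours are made-up values for illustration.

from sklearn.linear_model import LogisticRegression

# Optional sketch: fit a logistic regression classifier on the features and
# labels separated above, then predict for a hypothetical new student
clf = LogisticRegression()
clf.fit(X, y)

# Hypothetical student: 4.5 hours studied, 6 hours slept (illustrative values)
new_student = pd.DataFrame({'Hours_Studied': [4.5], 'Hours_Slept': [6]})
print("Predicted outcome (1 = Pass, 0 = Fail):", clf.predict(new_student)[0])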

2. Create a dataset that includes missing values, outliers, and duplicate entries. Clean the data by handling these issues and visualize the data before and after cleaning.


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Creating the dataset with intentional issues: missing values (np.nan),
# an outlier salary (1,000,000), and duplicate rows for IDs 105 and 106
data = {
    'Employee_ID': [101, 102, 103, 104, 105, 106, 107, 108,
                    109, 110, 105, 106],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Edward',
             'Frank', 'Grace', 'Helen', np.nan, 'Ivan', 'Edward', 'Frank'],
    'Age': [25, 30, 35, np.nan, 40, 45, 50, 55, 60, 65, 40, 45],
    'Department': ['HR', 'Finance', 'IT', 'IT', 'HR',
                   'Finance', 'Finance', 'IT', 'HR', np.nan, 'HR', 'Finance'],
    'Salary': [50000, 60000, 70000, 80000, 90000, 100000,
               110000, 120000, 1000000, 140000, 90000, 100000]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Display the dataset with issues
print("Dataset with issues:")
print(df)

# Plotting the data to visualize issues
plt.figure(figsize=(10, 6))
plt.scatter(df['Employee_ID'], df['Salary'], color='blue')
plt.xlabel('Employee ID')
plt.ylabel('Salary')
plt.title('Employee Salary Data with Issues')
plt.grid(True)
plt.show()

# Handling missing values (assignment instead of inplace=True, which is
# deprecated for chained calls in recent pandas versions)
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Name'] = df['Name'].fillna('Unknown')
df['Department'] = df['Department'].fillna('Unknown')

# Correcting data types
df['Employee_ID'] = df['Employee_ID'].astype(int)
df['Age'] = df['Age'].astype(int)

# Dealing with outliers (any salary > 200000 replaced with the median)
df.loc[df['Salary'] > 200000, 'Salary'] = df['Salary'].median()

# Removing duplicate entries
df.drop_duplicates(inplace=True)

# Display cleaned dataset
print("\nCleaned Dataset:")
print(df)

# Plotting the cleaned data
plt.figure(figsize=(10, 6))
plt.scatter(df['Employee_ID'], df['Salary'], color='green')
plt.xlabel('Employee ID')
plt.ylabel('Salary')
plt.title('Cleaned Employee Salary Data')
plt.grid(True)
plt.show()
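The fixed 200000 threshold above is arbitrary; a common data-driven alternative is Tukey's IQR rule. The sketch below is an optional addition, not part of the exercise: it flags salaries outside 1.5 interquartile ranges of the quartiles. Run before cleaning it would flag the 1,000,000 salary; run after, it should flag nothing, which is a quick sanity check.

# Optional sketch: IQR-based outlier detection as an alternative to the fixed
# 200000 threshold. The 1.5x multiplier is the conventional Tukey fence.
q1 = df['Salary'].quantile(0.25)
q3 = df['Salary'].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
outliers = df[(df['Salary'] < lower) | (df['Salary'] > upper)]
print("\nSalaries flagged by the IQR rule:")
print(outliers[['Employee_ID', 'Salary']])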


3. Create a simple dataset and demonstrate a basic data transformation technique such as normalization.

Case Study: Student Scores

Features:
1. Student ID
2. Score

Step 1: Create the Dataset
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# Creating the dataset
data = {
    'Student_ID': [1, 2, 3, 4, 5],
    'Score': [75, 85, 95, 65, 55]
}
# Convert to DataFrame
df = pd.DataFrame(data)
# Display the original dataset
print("Original Dataset:")
print(df)

# Normalization (Min-Max Scaling)
scaler = MinMaxScaler()
df['Score'] = scaler.fit_transform(df[['Score']])
# Display the transformed dataset
print("\nTransformed Dataset:")
print(df)

# Plotting the data before and after transformation
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
# Original Scores (read from the `data` dict, since df['Score'] now holds normalized values)
axs[0].bar(data['Student_ID'], data['Score'], color='blue')
axs[0].set_xlabel('Student ID')
axs[0].set_ylabel('Score')
axs[0].set_title('Original Scores')
axs[0].grid(True)
# Transformed Scores
axs[1].bar(df['Student_ID'], df['Score'], color='green')
axs[1].set_xlabel('Student ID')
axs[1].set_ylabel('Normalized Score')
axs[1].set_title('Normalized Scores')
axs[1].grid(True)
plt.tight_layout()
plt.show()
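MinMaxScaler applies the standard min-max formula (x - min) / (max - min), mapping the smallest score to 0 and the largest to 1. A short sketch reproducing the scaler's output by hand, using the original scores kept in the `data` dict:

# Reproduce MinMaxScaler by hand: scaled = (x - min) / (max - min).
# Uses the original scores from `data`, since df['Score'] was overwritten.
scores = pd.Series(data['Score'])
manual = (scores - scores.min()) / (scores.max() - scores.min())
print("\nManually normalized scores:")
print(manual)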


4. Create a simple dataset for a case study on data integration while considering redundancy. 

To handle redundancy, each student's information (such as name and age) should appear only once in the merged dataset, while exam scores across different subjects are integrated appropriately. The code below achieves this by dropping duplicate student records before merging.

Case Study: Student Information Integration with Redundancy Handling

Source 1: Basic Student Details
1. Student ID
2. Name
3. Age

Source 2: Exam Scores
1. Student ID
2. Subject
3. Score

Dataset Creation

import pandas as pd
import matplotlib.pyplot as plt

# Creating source data
data1 = {
    'Student_ID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [21, 22, 20, 23, 21]
}

data2 = {
    'Student_ID': [1, 2, 3, 4, 5],
    'Subject': ['Math', 'Math', 'Math', 'Math', 'Math'],
    'Score': [85, 90, 78, 92, 88]
}

data3 = {
    'Student_ID': [1, 2, 3, 4, 5],
    'Subject': ['Science', 'Science', 'Science', 'Science', 'Science'],
    'Score': [75, 80, 82, 79, 85]
}

# Convert to DataFrames: one table of student details,
# one long-format table of all exam scores
df1 = pd.DataFrame(data1)
df2 = pd.concat([pd.DataFrame(data2), pd.DataFrame(data3)], ignore_index=True)

print("Dataset 1:")
print(df1)
print("\nDataset 2:")
print(df2)

# Merge the two datasets on 'Student_ID'; drop_duplicates() guards against
# redundant student records in the details table
merged_df = pd.merge(df1.drop_duplicates(), df2, on='Student_ID')

# Display the merged dataset
print("\nMerged Dataset:")
print(merged_df)

# Visualization: scatter plot of scores by subject
fig, ax = plt.subplots(figsize=(10, 6))

subjects = merged_df['Subject'].unique()
colors = ['blue', 'green']

for i, subject in enumerate(subjects):
    subject_data = merged_df[merged_df['Subject'] == subject]
    ax.scatter(subject_data['Student_ID'],
               subject_data['Score'],
               label=subject,
               color=colors[i],
               s=100)

ax.set_xlabel('Student ID')
ax.set_ylabel('Score')
ax.set_title('Exam Scores by Subject (Redundancy Handled)')
ax.legend()
ax.grid(True)

plt.tight_layout()
plt.show()
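A useful follow-up (beyond what the exercise asks) is reshaping the merged long-format table so that each student occupies one row, which makes the absence of redundancy explicit. The sketch below uses pandas' pivot_table:

# Optional sketch: pivot to wide format, one row per student; Name and Age
# appear once, and each subject becomes its own score column
wide_df = merged_df.pivot_table(index=['Student_ID', 'Name', 'Age'],
                                columns='Subject',
                                values='Score').reset_index()
print("\nWide-format dataset (one row per student):")
print(wide_df)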

5. Create a dataset to demonstrate dimensionality reduction: given student exam scores for multiple subjects, reduce the dimensionality by combining scores from related subjects.

Case Study: Student Exam Performance

Dataset Creation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Creating source data
np.random.seed(0)
data = {
    'Student_ID': range(1, 11),
    'Math_Score': np.random.randint(60, 100, 10),
    'Science_Score': np.random.randint(65, 95, 10),
    'History_Score': np.random.randint(55, 90, 10),
    'English_Score': np.random.randint(50, 85, 10)
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Display the original dataset
print("Original Dataset:")
print(df)

# Manually reduce dimensionality
df['Math_Science_Score'] = df['Math_Score'] + df['Science_Score']
df['History_English_Score'] = df['History_Score'] + df['English_Score']

# Select relevant columns for visualization
reduced_df = df[['Student_ID', 'Math_Science_Score', 'History_English_Score']]

# Display the reduced dataset
print("\nReduced Dataset without PCA:")
print(reduced_df)

# Plotting the original and reduced data
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Original Scores
axs[0].scatter(df['Math_Score'], df['Science_Score'], color='blue', label='Math vs Science')
axs[0].scatter(df['History_Score'], df['English_Score'], color='green', label='History vs English')
axs[0].set_xlabel('Math / History Score')
axs[0].set_ylabel('Science / English Score')
axs[0].set_title('Original Scores')
axs[0].legend()
axs[0].grid(True)

# Reduced Scores
axs[1].scatter(reduced_df['Math_Science_Score'], reduced_df['History_English_Score'], color='purple')
axs[1].set_xlabel('Math_Science_Score')
axs[1].set_ylabel('History_English_Score')
axs[1].set_title('Reduced Scores')
axs[1].grid(True)

plt.tight_layout()
plt.show()
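The subject sums above are a hand-crafted reduction; PCA learns the combinations from the data instead. The sketch below is an alternative, not part of the exercise: it reduces the four score columns to two principal components and reports how much variance each captures.

from sklearn.decomposition import PCA

# Optional sketch: PCA as a data-driven alternative to the manual subject sums
score_cols = ['Math_Score', 'Science_Score', 'History_Score', 'English_Score']
pca = PCA(n_components=2)
components = pca.fit_transform(df[score_cols])
print("\nExplained variance ratio:", pca.explained_variance_ratio_)
print("First two principal components:")
print(pd.DataFrame(components, columns=['PC1', 'PC2']))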


9. Create a simple linear regression model using scikit-learn to predict house prices based on house sizes.

# Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Sample Data
house_size = np.array([1500, 1600, 1700, 1800, 1900])
house_price = np.array([300000, 320000, 340000, 360000, 380000])

# Reshape the Data
house_size = house_size.reshape(-1, 1)

# Create and Train the Model
model = LinearRegression()
model.fit(house_size, house_price)

# Predict
predicted_price = model.predict([[2000]])

# Print Predicted Price
print(f'Predicted price for a 2000 sq.ft house: ${predicted_price[0]:.2f}')

# Plotting
plt.scatter(house_size, house_price, color='blue', label='Actual Data')
plt.plot(house_size, model.predict(house_size), color='red', label='Regression Line')
plt.xlabel('House Size (sq.ft)')
plt.ylabel('House Price ($)')
plt.title('House Price Prediction')
plt.legend()
plt.grid(True)
plt.show()
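For a single-feature model, the fitted line is fully described by its slope and intercept, which can be read off the trained model directly. A short optional sketch:

# Optional: inspect the learned parameters. For simple linear regression,
# predicted_price = model.coef_[0] * size + model.intercept_
print(f'Slope (price per sq.ft): {model.coef_[0]:.2f}')
print(f'Intercept: {model.intercept_:.2f}')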

