1. Create a simple dataset and then separate features and labels, based on a case study of predicting whether a student will pass or fail an exam from their study hours and sleep hours.
Case Study: Student Exam Performance Features:
1. Hours Studied
2. Hours Slept
Label: 1. Pass (1) or Fail (0)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Case study: predict whether a student passes an exam from two features
# (hours studied, hours slept). Label: Pass = 1, Fail = 0.
data = {
    'Hours_Studied': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Hours_Slept': [5, 6, 5, 4, 5, 6, 7, 8, 6, 5],
    'Pass': [0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
}

# Build the DataFrame and show the full dataset
df = pd.DataFrame(data)
print("Dataset:")
print(df)

# Split into feature matrix X (two columns) and label vector y
feature_columns = ['Hours_Studied', 'Hours_Slept']
X = df[feature_columns]
y = df['Pass']

print("\nFeatures (X):")
print(X)
print("\nLabels (y):")
print(y)

# Scatter plot of the two features, colored by outcome: red = fail, green = pass
colors = {0: 'red', 1: 'green'}
plt.figure(figsize=(10, 6))
plt.scatter(df['Hours_Studied'], df['Hours_Slept'], c=df['Pass'].map(colors))
plt.xlabel('Hours Studied')
plt.ylabel('Hours Slept')
plt.title('Student Exam Performance')
plt.grid(True)
plt.show()
2. Create a dataset that includes missing values, outliers, and duplicate entries. Clean the data by handling these issues and visualize the data before and after cleaning.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Creating the dataset with intentional issues:
# - missing values (np.nan in Name, Age, Department)
# - an outlier salary (1,000,000)
# - duplicate rows (employees 105 and 106 appear twice)
data = {
'Employee_ID': [101, 102, 103, 104, 105, 106, 107, 108,
109, 110, 105, 106],
'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Edward',
'Frank', 'Grace', 'Helen', np.nan, 'Ivan', 'Edward', 'Frank'],
'Age': [25, 30, 35, np.nan, 40, 45, 50, 55, 60, 65, 40, 45],
'Department': ['HR', 'Finance', 'IT', 'IT', 'HR',
'Finance', 'Finance', 'IT', 'HR', np.nan, 'HR', 'Finance'],
'Salary': [50000, 60000, 70000, 80000, 90000, 100000,
110000, 120000, 1000000, 140000, 90000, 100000]
}
# Convert to DataFrame
df = pd.DataFrame(data)
# Display the dataset with issues
print("Dataset with issues:")
print(df)
# Plotting the raw data so the salary outlier is visible
plt.figure(figsize=(10, 6))
plt.scatter(df['Employee_ID'], df['Salary'], color='blue')
plt.xlabel('Employee ID')
plt.ylabel('Salary')
plt.title('Employee Salary Data with Issues')
plt.grid(True)
plt.show()
# Handling missing values.
# FIX: df[col].fillna(..., inplace=True) operates on a column selection and is
# deprecated in pandas 2.x (stops working under Copy-on-Write in pandas 3.0);
# assign the result back to the column instead.
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Name'] = df['Name'].fillna('Unknown')
df['Department'] = df['Department'].fillna('Unknown')
# Correcting data types (Age was promoted to float while it held NaN)
df['Employee_ID'] = df['Employee_ID'].astype(int)
df['Age'] = df['Age'].astype(int)
# Dealing with outliers (any salary > 200000 replaced with the column median)
df.loc[df['Salary'] > 200000, 'Salary'] = df['Salary'].median()
# Removing duplicate entries (rows identical in every column)
df.drop_duplicates(inplace=True)
# Display cleaned dataset
print("\nCleaned Dataset:")
print(df)
# Plotting the cleaned data
plt.figure(figsize=(10, 6))
plt.scatter(df['Employee_ID'], df['Salary'], color='green')
plt.xlabel('Employee ID')
plt.ylabel('Salary')
plt.title('Cleaned Employee Salary Data')
plt.grid(True)
plt.show()
3. Create a dataset of student scores and apply a data transformation (min-max normalization), visualizing the scores before and after. Step 1: Create the Dataset
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Small dataset of five exam scores to demonstrate min-max normalization.
data = {
    'Student_ID': [1, 2, 3, 4, 5],
    'Score': [75, 85, 95, 65, 55],
}

df = pd.DataFrame(data)
print("Original Dataset:")
print(df)

# Rescale Score into [0, 1] with min-max scaling: (x - min) / (max - min).
# This overwrites the column in place; the raw values survive only in the
# `data` dict, which the left-hand plot reads from.
scaler = MinMaxScaler()
df['Score'] = scaler.fit_transform(df[['Score']])

print("\nTransformed Dataset:")
print(df)

# Side-by-side bar charts: raw scores (left) vs normalized scores (right)
fig, axs = plt.subplots(1, 2, figsize=(12, 5))
raw_axis, scaled_axis = axs

raw_axis.bar(data['Student_ID'], data['Score'], color='blue')
raw_axis.set_xlabel('Student ID')
raw_axis.set_ylabel('Score')
raw_axis.set_title('Original Scores')
raw_axis.grid(True)

scaled_axis.bar(df['Student_ID'], df['Score'], color='green')
scaled_axis.set_xlabel('Student ID')
scaled_axis.set_ylabel('Normalized Score')
scaled_axis.set_title('Normalized Scores')
scaled_axis.grid(True)

plt.tight_layout()
plt.show()
4. Create a simple dataset for a case study on data integration while considering redundancy.
To handle redundancy in the dataset, we should ensure that each student's information (such as name and age) appears only once in the merged dataset, while their exam scores across different subjects are appropriately integrated. Here's how we can modify the code to achieve this:
Case Study: Student Information Integration with Redundancy Handling
Source 1: Basic Student Details
Student ID
Name
Age
Source 2: Exam Scores
Student ID
Subject
Score
Dataset Creation
import pandas as pd
import matplotlib.pyplot as plt

# Source 1: basic student details (one row per student)
data1 = {
'Student_ID': [1, 2, 3, 4, 5],
'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
'Age': [21, 22, 20, 23, 21]
}
# Source 2: Math scores
data2 = {
'Student_ID': [1, 2, 3, 4, 5],
'Subject': ['Math', 'Math', 'Math', 'Math', 'Math'],
'Score': [85, 90, 78, 92, 88]
}
# Source 3: Science scores
data3 = {
'Student_ID': [1, 2, 3, 4, 5],
'Subject': ['Science', 'Science', 'Science', 'Science', 'Science'],
'Score': [75, 80, 82, 79, 85]
}
# Convert to DataFrames; stack both score sources into one long table
# (one row per student/subject pair)
df1 = pd.DataFrame(data1)
df2 = pd.concat([pd.DataFrame(data2), pd.DataFrame(data3)], ignore_index=True)
print("Dataset 1:")
print(df1)
print("\nDataset 2:")
print(df2)
# Merge on 'Student_ID'. drop_duplicates() on df1 handles redundancy: a
# student whose details appear twice would otherwise have every score row
# duplicated by the join.
merged_df = pd.merge(df1.drop_duplicates(), df2, on='Student_ID')
# Display the merged dataset
print("\nMerged Dataset:")
print(merged_df)
# Visualization: scatter plot of scores by subject
fig, ax = plt.subplots(figsize=(10, 6))
subjects = merged_df['Subject'].unique()
colors = ['blue', 'green']
for i, subject in enumerate(subjects):
    subject_data = merged_df[merged_df['Subject'] == subject]
    # FIX: cycle through the palette so a third (or later) subject no
    # longer raises IndexError; identical colors for the current two.
    ax.scatter(subject_data['Student_ID'],
               subject_data['Score'],
               label=subject,
               color=colors[i % len(colors)],
               s=100)
ax.set_xlabel('Student ID')
ax.set_ylabel('Score')
ax.set_title('Exam Scores by Subject (Redundancy Handled)')
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()
0 Comments