You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
41 lines
1.1 KiB
Python
41 lines
1.1 KiB
Python
10 months ago
|
"""
|
||
|
author: Wojciech Janota
|
||
|
laboratory: Lab 2, ex 2
|
||
|
"""
|
||
|
|
||
|
import pandas as pd
|
||
|
from scipy.stats import zscore
|
||
|
|
||
|
# Load the raw laboratory data set (comma-separated text file).
input_data = pd.read_csv("zb_6.txt", sep=',')

# Replace all non-numeric values with NaN. The conversion is done column by
# column so pandas can parse each column in one vectorised pass.
input_data = input_data.apply(pd.to_numeric, errors='coerce')

# Replace all NaN values with the median of their column.
input_data = input_data.fillna(input_data.median())

# Find outliers using the z-score, i.e. the distance of each value from its
# column mean expressed in standard deviations.
numeric_columns = input_data[['a_1', 'a_2', 'a_3']]
z_scores = zscore(numeric_columns)

# Threshold for finding outliers (more than 3 std dev above or below the mean)
threshold = 3

# Row mask of detected outliers: True when at least one of the inspected
# columns exceeds the threshold. The absolute value has to be taken of the
# z-scores themselves (not of the comparison result), otherwise values far
# below the mean are never reported.
outliers = (abs(z_scores) > threshold).any(axis=1)

# Keep only the rows with outliers and store them for inspection.
outliers_rows = input_data[outliers]
outliers_rows.to_csv("detected_outliers.csv", sep=',', index_label="index")

# Remove detected outliers from the original data and write to a new file.
cleaned_data = input_data[~outliers]
cleaned_data.to_csv("zb_6_cleaned_outliers.csv", sep=',', index_label="index")
|