You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

41 lines
1.1 KiB
Python

10 months ago
"""
author: Wojciech Janota
laboratory: Lab 2, ex 2
"""
import pandas as pd
from scipy.stats import zscore
# Load the raw data set from the comma-separated input file.
input_data = pd.read_csv("zb_6.txt", sep=',')

# Coerce every cell to a number; anything that cannot be parsed becomes NaN.
input_data = input_data.map(lambda x: pd.to_numeric(x, errors='coerce'))

# Impute the missing values with the median of their column.
input_data = input_data.fillna(input_data.median())

# Detect outliers with the z-score: the distance of each value from its
# column mean, expressed in standard deviations.
numeric_columns = input_data[['a_1', 'a_2', 'a_3']]
z_scores = zscore(numeric_columns)

# A value counts as an outlier when it lies more than `threshold` standard
# deviations away from the mean, in either direction.
threshold = 3

# Row mask: True when at least one of the checked columns holds an outlier.
# The absolute value has to be taken BEFORE the comparison, otherwise values
# far below the mean (z < -threshold) would never be flagged.
outliers = (abs(z_scores) > threshold).any(axis=1)

# Keep only the rows flagged as outliers and store them separately.
outliers_rows = input_data[outliers]
outliers_rows.to_csv("detected_outliers.csv", sep=',', index_label="index")

# Remove the detected outliers from the original data and write the
# remaining rows to a new file.
cleaned_data = input_data[~outliers]
cleaned_data.to_csv("zb_6_cleaned_outliers.csv", sep=',', index_label="index")