""" author: Wojciech Janota laboratory: Lab 2, ex 2 """ import pandas as pd from scipy.stats import zscore input_data = pd.read_csv("zb_6.txt", sep=',') # replace all non-numeric values to NaN input_data = input_data.map(lambda x: pd.to_numeric(x, errors='coerce')) # replace all NaN values with median input_data.fillna(input_data.median(), inplace=True) # # Check for NaN values in prepared DataFrame # nan_mask = input_data.isna().any(axis=1) # rows_with_nan = input_data[nan_mask] # print(rows_with_nan) # Find outliers using z-score (interquartile range) numeric_columns = input_data[['a_1', 'a_2', 'a_3']] z_scores = zscore(numeric_columns) # Threshold for finding outliers (over and under 3 std dev) threshold = 3 # Matrix of detected outliers outliers = (abs(z_scores > threshold).any(axis=1)) # Filter out only the rows with outliers outliers_rows = input_data[outliers] outliers_rows.to_csv("detected_outliers.csv", sep=',', index_label="index") # Remove detected outliers form the original data and write to new file cleaned_data = input_data.drop(outliers_rows.index) cleaned_data.to_csv("zb_6_cleaned_outliers.csv", sep=',', index_label="index")