diff --git a/lab1/dataset_plot.png b/lab1/dataset_plot.png
new file mode 100644
index 0000000..17492eb
Binary files /dev/null and b/lab1/dataset_plot.png differ
diff --git a/lab1/preprocess.py b/lab1/preprocess.py
index 1e57e26..0780cc4 100644
--- a/lab1/preprocess.py
+++ b/lab1/preprocess.py
@@ -1,8 +1,139 @@
 import pandas as pd
-
+from sklearn.preprocessing import MinMaxScaler
+import matplotlib.pyplot as plt
+
+# Columns whose missing values are filled with the column mode.
+COLUMNS_FILLED_WITH_MODE = [
+    "on_antithyroid_medication",
+    "pregnant",
+    "thyroid_surgery",
+    "query_hypothyroid",
+    "query_hyperthyroid",
+    "lithium",
+]
+
+# Columns whose missing values are filled with the column median; the same
+# columns are min-max normalized and used for the IQR outlier detection.
+columns_with_continuous_attributes = ["TSH", "T3_measured", "TT4_measured", "T4U_measured", "FTI_measured"]
+
+# A value is flagged as an outlier when it lies more than this many IQRs
+# below Q1 or above Q3.
+IQR_MULTIPLIER = 2.5
+
+# Point/legend colors keyed by the values of the "Outlier_label" column.
+color_mapping = {
+    "o": "red",
+    "n": "green",
+}
+
+# (column plotted against TSH, marker name shown in the title, output file stem)
+PLOTTED_MARKERS = [
+    ("T3_measured", "T3", "tsh_t3_plot"),
+    ("TT4_measured", "TT4", "tsh_tt4_plot"),
+    ("T4U_measured", "T4U", "tsh_t4u_plot"),
+    ("FTI_measured", "FTI", "tsh_fti_plot"),
+]
+
+
+def columns_with_missing(data):
+    """Return the labels of the columns of *data* that contain missing values."""
+    missing_counts = data.isnull().sum()
+    return missing_counts[missing_counts > 0].index
+
+
+def save_outlier_scatter(data, y_column, marker_name, file_name, title_suffix=""):
+    """Save a scatter plot of TSH against *y_column*, colored by "Outlier_label".
+
+    *marker_name* and *title_suffix* are inserted into the plot title and the
+    figure is written to *file_name*.
+    """
+    # The colors are explicit names, so no colormap is passed: a colormap is
+    # only applied to numeric color values.
+    ax = data.plot(x="TSH", y=y_column, kind="scatter", c=data["Outlier_label"].map(color_mapping))
+    legend_config = [
+        plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label)
+        for label, color in color_mapping.items()
+    ]
+    ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")
+    ax.set_title(f"Correlation between TSH, {marker_name} markers \nand outlier classification{title_suffix}")
+    figure = ax.get_figure()
+    figure.savefig(file_name)
+    # Release the figure so repeated calls do not accumulate open figures.
+    plt.close(figure)
+
+
 input_data = pd.read_csv("annthyroid_unsupervised_anomaly_detection.csv", sep=";")
 
 print(input_data.head(10))
 
 # deal with missing values
-print(input_data[ input_data.isna().any(axis=1) ])
\ No newline at end of file
+
+# print all rows with missing values
+print("---All rows with missing values---")
+print(input_data[input_data.isna().any(axis=1)])
+
+print("---Columns with missing values---")
+print(columns_with_missing(input_data))
+
+for column in COLUMNS_FILLED_WITH_MODE:
+    print(f"Filling column '{column}' with mode of this column")
+    input_data[column] = input_data[column].fillna(input_data[column].mode().iloc[0])
+
+for column in columns_with_continuous_attributes:
+    print(f"Filling column '{column}' with median of this column")
+    input_data[column] = input_data[column].fillna(input_data[column].median())
+
+print("---Checking after filling N/A values---")
+print("---Columns with missing values (after filling)---")
+print(columns_with_missing(input_data))
+
+print("---Rows with missing values (after filling)---")
+print(input_data[input_data.isna().any(axis=1)])
+
+# normalize columns with continuous attributes
+print("---Normalizing continuous attributes---")
+scaler = MinMaxScaler()
+input_data[columns_with_continuous_attributes] = scaler.fit_transform(input_data[columns_with_continuous_attributes])
+
+# detect outliers in the continuous attributes with the IQR rule
+continuous_data = input_data[columns_with_continuous_attributes]
+Q1 = continuous_data.quantile(0.25)
+Q3 = continuous_data.quantile(0.75)
+IQR = Q3 - Q1
+lower_bound = Q1 - IQR_MULTIPLIER * IQR
+upper_bound = Q3 + IQR_MULTIPLIER * IQR
+found_outliers = ((continuous_data < lower_bound) | (continuous_data > upper_bound)).any(axis=1)
+print("---Detected outliers based on continuous attributes---")
+print(input_data[found_outliers])
+
+dataset_cleaned = input_data[~found_outliers]
+
+# plot every marker against TSH, first for all rows, then without the detected outliers
+for y_column, marker_name, file_stem in PLOTTED_MARKERS:
+    save_outlier_scatter(input_data, y_column, marker_name, f"{file_stem}.png")
+for y_column, marker_name, file_stem in PLOTTED_MARKERS:
+    save_outlier_scatter(
+        dataset_cleaned, y_column, marker_name, f"{file_stem}_outliers_removed.png", " (after outliers cleaning)"
+    )
+
+# rate the outlier detection based on the Outlier_label column
+labeled_outlier = input_data["Outlier_label"] == "o"
+labeled_normal = input_data["Outlier_label"] == "n"
+true_positives_count = (found_outliers & labeled_outlier).sum()
+false_positives_count = (found_outliers & labeled_normal).sum()
+true_negatives_count = (~found_outliers & labeled_normal).sum()
+false_negatives_count = (~found_outliers & labeled_outlier).sum()
+# "not an outlier" is the target class, so both ratios are built around the true negatives
+precision = true_negatives_count / (true_negatives_count + false_negatives_count)
+recall = true_negatives_count / (true_negatives_count + false_positives_count)
+
+print(f"""
+---Classification of the outlier detection---
+(we treat negative prediction as our target)
+True positives: {true_positives_count}
+False positives: {false_positives_count}
+True negatives: {true_negatives_count}
+False negatives: {false_negatives_count}
+Precision: {precision}
+Recall: {recall}
+""")
diff --git a/lab1/requirements.txt b/lab1/requirements.txt
index 02317a7..e1f36f0 100644
--- a/lab1/requirements.txt
+++ b/lab1/requirements.txt
@@ -1,6 +1,18 @@
+contourpy==1.2.0
+cycler==0.12.1
+fonttools==4.45.0
+joblib==1.3.2
+kiwisolver==1.4.5
+matplotlib==3.8.2
 numpy==1.26.2
+packaging==23.2
 pandas==2.1.3
+Pillow==10.1.0
+pyparsing==3.1.1
 python-dateutil==2.8.2
 pytz==2023.3.post1
+scikit-learn==1.3.2
+scipy==1.11.4
 six==1.16.0
+threadpoolctl==3.2.0
 tzdata==2023.3
diff --git a/lab1/tsh_fti_plot.png b/lab1/tsh_fti_plot.png
new file mode 100644
index 0000000..7263944
Binary files /dev/null and b/lab1/tsh_fti_plot.png differ
diff --git a/lab1/tsh_fti_plot_outliers_removed.png b/lab1/tsh_fti_plot_outliers_removed.png
new file mode 100644
index 0000000..076fe4e
Binary files /dev/null and b/lab1/tsh_fti_plot_outliers_removed.png differ
diff --git a/lab1/tsh_t3_plot.png b/lab1/tsh_t3_plot.png
new file mode 100644
index 0000000..5bfca99
Binary files /dev/null and b/lab1/tsh_t3_plot.png differ
diff --git a/lab1/tsh_t3_plot_outliers_removed.png b/lab1/tsh_t3_plot_outliers_removed.png
new file mode 100644
index 0000000..ee7c071
Binary files /dev/null and b/lab1/tsh_t3_plot_outliers_removed.png differ
diff --git a/lab1/tsh_t4u_plot.png b/lab1/tsh_t4u_plot.png
new file mode 100644
index 0000000..7b916be
Binary files /dev/null and b/lab1/tsh_t4u_plot.png differ
diff --git a/lab1/tsh_t4u_plot_outliers_removed.png b/lab1/tsh_t4u_plot_outliers_removed.png
new file mode 100644
index 0000000..9529b78
Binary files /dev/null and b/lab1/tsh_t4u_plot_outliers_removed.png differ
diff --git a/lab1/tsh_tt4_plot.png b/lab1/tsh_tt4_plot.png
new file mode 100644
index 0000000..1acbe50
Binary files /dev/null and b/lab1/tsh_tt4_plot.png differ
diff --git a/lab1/tsh_tt4_plot_outliers_removed.png b/lab1/tsh_tt4_plot_outliers_removed.png
new file mode 100644
index 0000000..a96b04a
Binary files /dev/null and b/lab1/tsh_tt4_plot_outliers_removed.png differ