You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

160 lines
9.1 KiB
Python

10 months ago
import pandas as pd
10 months ago
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
10 months ago
# Load the semicolon-separated dataset into a DataFrame and preview its first
# ten rows.
input_data = pd.read_csv(
    "annthyroid_unsupervised_anomaly_detection.csv",
    sep=";",
)
print(input_data.head(n=10))
# deal with missing values
10 months ago
# Report where the data is incomplete: first every row that has at least one
# missing cell, then the names of the columns that contain missing cells.
print("---All rows with missing values---")
rows_with_any_gap = input_data.isna().any(axis=1)
print(input_data[rows_with_any_gap])

# Number of missing cells per column; only columns with a non-zero count are
# kept for the report.
missing_values = input_data.isnull().sum()
columns_with_missing_values = missing_values.loc[missing_values > 0].index
print("---Columns with missing values---")
print(columns_with_missing_values)
# Impute missing values column by column. Columns in the first group receive
# their most frequent value (mode); columns in the second group receive their
# median. The groups are processed in the order listed.
columns_filled_with_mode = [
    "on_antithyroid_medication",
    "pregnant",
    "thyroid_surgery",
    "query_hypothyroid",
    "query_hyperthyroid",
    "lithium",
]
columns_filled_with_median = [
    "TSH",
    "T3_measured",
    "TT4_measured",
    "T4U_measured",
    "FTI_measured",
]

for column_name in columns_filled_with_mode:
    print(f"Filling column '{column_name}' with mode of this column")
    # mode() returns a Series (several values on ties); use its first entry.
    most_frequent_value = input_data[column_name].mode().iloc[0]
    input_data[column_name] = input_data[column_name].fillna(most_frequent_value)

for column_name in columns_filled_with_median:
    print(f"Filling column '{column_name}' with median of this column")
    median_value = input_data[column_name].median()
    input_data[column_name] = input_data[column_name].fillna(median_value)
# Sanity check after imputation: recount the missing cells and print whatever
# columns and rows still contain any.
print("---Checking after filling N/A values---")

missing_values = input_data.isnull().sum()
columns_with_missing_values = missing_values.loc[missing_values > 0].index
print("---Columns with missing values (after filling)---")
print(columns_with_missing_values)

print("---Rows with missing values (after filling)---")
rows_still_incomplete = input_data.isna().any(axis=1)
print(input_data[rows_still_incomplete])
# Columns holding continuous measurements; these are rescaled in place below.
columns_with_continuous_attributes = [
    "TSH",
    "T3_measured",
    "TT4_measured",
    "T4U_measured",
    "FTI_measured",
]

# Fit a min-max scaler on the continuous columns and overwrite them with the
# scaled values.
print("---Normalizing continuous attributes---")
scaler = MinMaxScaler()
continuous_values = input_data[columns_with_continuous_attributes]
input_data[columns_with_continuous_attributes] = scaler.fit_transform(continuous_values)
# Flag outliers with per-column interquartile-range fences: a row is an
# outlier when any continuous attribute lies more than 2.5 * IQR below the
# first quartile or above the third quartile of its column.
continuous_frame = input_data[columns_with_continuous_attributes]
Q1 = continuous_frame.quantile(0.25)
Q3 = continuous_frame.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 2.5 * IQR
upper_bound = Q3 + 2.5 * IQR

# Cell-wise checks against each column's own fence, then collapsed to one
# boolean per row.
below_lower_fence = continuous_frame < lower_bound
above_upper_fence = continuous_frame > upper_bound
found_outliers = (below_lower_fence | above_upper_fence).any(axis=1)

print("---Detected outliers based on continuous attributes---")
print(input_data[found_outliers])
# Colour used for each value of the Outlier_label column; the keys double as
# the legend labels.
color_mapping = {
    "o": "red",
    "n": "green",
}

# One scatter plot of TSH against each other continuous marker, over all rows:
# (y-axis column, plot title, output file).
tsh_scatter_specs = [
    ("T3_measured", 'Correlation between TSH, T3 markers \nand outlier classification', 'tsh_t3_plot.png'),
    ("TT4_measured", 'Correlation between TSH, TT4 markers \nand outlier classification', 'tsh_tt4_plot.png'),
    ("T4U_measured", 'Correlation between TSH, T4U markers \nand outlier classification', 'tsh_t4u_plot.png'),
    ("FTI_measured", 'Correlation between TSH, FTI markers \nand outlier classification', 'tsh_fti_plot.png'),
]

# Per-row colour names derived from the ground-truth label. Because explicit
# colours are supplied, no colormap is passed to the plot call: there is no
# numeric data to colour-map, so a colormap would be ignored (with a warning).
point_colors = input_data["Outlier_label"].map(color_mapping)

# Proxy artists for the legend: one filled circle per label, in its colour.
# They do not depend on the plotted column, so they are built once.
legend_config = [
    plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label)
    for label, color in color_mapping.items()
]

for y_column, plot_title, output_file in tsh_scatter_specs:
    # Each plot call creates a new figure, which then is the current figure
    # targeted by plt.title / plt.savefig.
    ax = input_data.plot(x="TSH", y=y_column, kind="scatter", c=point_colors)
    ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")
    plt.title(plot_title)
    plt.savefig(output_file)
# Keep only the rows that were not flagged as outliers.
dataset_cleaned = input_data[~found_outliers]

# The same TSH scatter plots as for the full data, drawn from the cleaned
# rows: (y-axis column, plot title, output file). Titles are kept exactly as
# they were, including their individual spacing.
cleaned_scatter_specs = [
    ("T3_measured", 'Correlation between TSH, T3 markers \n and outlier classification (after outliers cleaning)', 'tsh_t3_plot_outliers_removed.png'),
    ("TT4_measured", 'Correlation between TSH, TT4 markers \nand outlier classification (after outliers cleaning)', 'tsh_tt4_plot_outliers_removed.png'),
    ("T4U_measured", 'Correlation between TSH, T4U markers \nand outlier classification (after outliers cleaning)', 'tsh_t4u_plot_outliers_removed.png'),
    ("FTI_measured", 'Correlation between TSH, FTI markers \nand outlier classification (after outliers cleaning)', 'tsh_fti_plot_outliers_removed.png'),
]

# Per-row colour names derived from the ground-truth label. Because explicit
# colours are supplied, no colormap is passed to the plot call: there is no
# numeric data to colour-map, so a colormap would be ignored (with a warning).
cleaned_point_colors = dataset_cleaned["Outlier_label"].map(color_mapping)

# Proxy artists for the legend: one filled circle per label, in its colour.
# They do not depend on the plotted column, so they are built once.
legend_config = [
    plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label)
    for label, color in color_mapping.items()
]

for y_column, plot_title, output_file in cleaned_scatter_specs:
    # Each plot call creates a new figure, which then is the current figure
    # targeted by plt.title / plt.savefig.
    ax = dataset_cleaned.plot(x="TSH", y=y_column, kind="scatter", c=cleaned_point_colors)
    ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")
    plt.title(plot_title)
    plt.savefig(output_file)
# Score the IQR-based detection against the ground-truth Outlier_label column.
# "Positive" means a row was flagged as an outlier; "negative" means it was kept.
labelled_as_outlier = input_data["Outlier_label"] == "o"
labelled_as_normal = input_data["Outlier_label"] == "n"
not_flagged = ~found_outliers

# Confusion-matrix cells, counted directly from the boolean masks.
# (The spelling of true_postitives_count is kept as-is.)
true_postitives_count = (found_outliers & labelled_as_outlier).sum()
false_positives_count = (found_outliers & labelled_as_normal).sum()
false_negatives_count = (not_flagged & labelled_as_outlier).sum()
true_negatives_count = (not_flagged & labelled_as_normal).sum()

# Precision and recall are computed for the *negative* class (rows kept as
# non-outliers), as stated in the printed report.
kept_rows_total = true_negatives_count + false_negatives_count
normal_rows_total = true_negatives_count + false_positives_count
precision = true_negatives_count / kept_rows_total
recall = true_negatives_count / normal_rows_total

report_lines = [
    "",
    "---Classification of the outlier detection---",
    "(we treat negative prediction as our target)",
    f"True positives: {true_postitives_count}",
    f"False positives: {false_positives_count}",
    f"True negatives: {true_negatives_count}",
    f"False negatives: {false_negatives_count}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    "",
]
print("\n".join(report_lines))