WIP
BIN
lab1/dataset_plot.png
Normal file
After Width: | Height: | Size: 36 KiB |
@ -1,8 +1,160 @@
|
||||
import pandas as pd
|
||||
|
||||
from sklearn.preprocessing import MinMaxScaler
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Load the dataset; the file is read with ';' as the field separator.
input_data = pd.read_csv(
    "annthyroid_unsupervised_anomaly_detection.csv",
    sep=";",
)

# Preview the first ten rows as a quick check that the file parsed as expected.
print(input_data.head(n=10))
|
||||
# deal with missing values

# Show every row that has at least one missing cell.
print("---All rows with missing values---")
rows_with_gaps = input_data.isna().any(axis=1)
print(input_data[rows_with_gaps])

# Count missing cells per column, then keep only the labels of columns
# that have at least one missing cell.
missing_values = input_data.isnull().sum()
columns_with_missing_values = missing_values.loc[missing_values.gt(0)].index

print("---Columns with missing values---")
print(columns_with_missing_values)
|
||||
|
||||
# Impute missing values column by column. The column lists drive the loops
# below, so adding a column means adding one list entry instead of copying a
# print/fillna pair.

# Columns whose gaps are filled with the most frequent value.
# NOTE(review): presumably these are categorical/flag columns - confirm
# against the dataset description.
mode_filled_columns = [
    "on_antithyroid_medication",
    "pregnant",
    "thyroid_surgery",
    "query_hypothyroid",
    "query_hyperthyroid",
    "lithium",
]

# Columns whose gaps are filled with the column median.
median_filled_columns = [
    "TSH",
    "T3_measured",
    "TT4_measured",
    "T4U_measured",
    "FTI_measured",
]

for column in mode_filled_columns:
    print(f"Filling column '{column}' with mode of this column")
    # mode() returns a Series (several entries when values tie); use the first.
    most_frequent_value = input_data[column].mode().iloc[0]
    input_data[column] = input_data[column].fillna(most_frequent_value)

for column in median_filled_columns:
    print(f"Filling column '{column}' with median of this column")
    input_data[column] = input_data[column].fillna(input_data[column].median())
|
||||
|
||||
print("---Checking after filling N/A values---")

# Recount missing cells per column and keep the labels of columns that
# still contain at least one missing cell.
missing_values = input_data.isnull().sum()
columns_with_missing_values = missing_values.loc[missing_values.gt(0)].index

print("---Columns with missing values (after filling)---")
print(columns_with_missing_values)

# Show any rows that still have a missing cell.
print("---Rows with missing values (after filling)---")
remaining_gap_rows = input_data.isna().any(axis=1)
print(input_data[remaining_gap_rows])
|
||||
|
||||
columns_with_continuous_attributes = [
    "TSH",
    "T3_measured",
    "TT4_measured",
    "T4U_measured",
    "FTI_measured",
]

# Rescale the continuous columns in place with scikit-learn's MinMaxScaler
# (default settings), fitted on these same columns.
print("---Normalizing continuous attributes---")
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(input_data[columns_with_continuous_attributes])
input_data[columns_with_continuous_attributes] = scaled_values
|
||||
|
||||
# Flag outlier rows using interquartile-range fences on the continuous columns.
continuous_values = input_data[columns_with_continuous_attributes]

# Per-column quartiles and the spread between them.
Q1 = continuous_values.quantile(0.25)
Q3 = continuous_values.quantile(0.75)
IQR = Q3 - Q1

# Fences sit 2.5 IQRs outside the quartiles.
lower_bound = Q1 - 2.5 * IQR
upper_bound = Q3 + 2.5 * IQR

# A row is an outlier when any continuous column falls outside its fences.
below_lower_fence = continuous_values < lower_bound
above_upper_fence = continuous_values > upper_bound
found_outliers = (below_lower_fence | above_upper_fence).any(axis=1)

print("---Detected outliers based on continuous attributes---")
print(input_data[found_outliers])
|
||||
|
||||
# Point color for each value of the "Outlier_label" column.
color_mapping = {
    "o": "red",
    "n": "green",
}


def _save_outlier_scatter(frame, y_column, title, output_path):
    """Save a scatter plot of TSH against ``y_column`` colored by Outlier_label.

    Args:
        frame: DataFrame with "TSH", ``y_column`` and "Outlier_label" columns.
        y_column: Name of the column drawn on the y axis.
        title: Plot title.
        output_path: Path of the image file to write.
    """
    # Explicit color names are passed per point, so no colormap is given:
    # matplotlib ignores a colormap in that case and emits a warning.
    point_colors = frame["Outlier_label"].map(color_mapping)
    ax = frame.plot(x="TSH", y=y_column, kind="scatter", c=point_colors)

    # Build legend entries by hand: one round marker per label/color pair.
    legend_handles = [
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=color,
            markersize=10,
            label=label,
        )
        for label, color in color_mapping.items()
    ]
    ax.legend(
        handles=legend_handles,
        title="Categories: is outlier? \n(o - outlier, n - not an outlier)",
    )
    ax.set_title(title)
    ax.figure.savefig(output_path)
    # Release the figure once it is on disk so figures do not accumulate.
    plt.close(ax.figure)


# (y column, title, output file) for the plots of the full dataset.
full_dataset_plots = [
    (
        "T3_measured",
        'Correlation between TSH, T3 markers \nand outlier classification',
        'tsh_t3_plot.png',
    ),
    (
        "TT4_measured",
        'Correlation between TSH, TT4 markers \nand outlier classification',
        'tsh_tt4_plot.png',
    ),
    (
        "T4U_measured",
        'Correlation between TSH, T4U markers \nand outlier classification',
        'tsh_t4u_plot.png',
    ),
    (
        "FTI_measured",
        'Correlation between TSH, FTI markers \nand outlier classification',
        'tsh_fti_plot.png',
    ),
]

for y_column, title, output_path in full_dataset_plots:
    _save_outlier_scatter(input_data, y_column, title, output_path)

# Keep only the rows that the IQR check did not flag.
dataset_cleaned = input_data[~found_outliers]

# (y column, title, output file) for the plots after outlier removal.
cleaned_dataset_plots = [
    (
        "T3_measured",
        'Correlation between TSH, T3 markers \n and outlier classification (after outliers cleaning)',
        'tsh_t3_plot_outliers_removed.png',
    ),
    (
        "TT4_measured",
        'Correlation between TSH, TT4 markers \nand outlier classification (after outliers cleaning)',
        'tsh_tt4_plot_outliers_removed.png',
    ),
    (
        "T4U_measured",
        'Correlation between TSH, T4U markers \nand outlier classification (after outliers cleaning)',
        'tsh_t4u_plot_outliers_removed.png',
    ),
    (
        "FTI_measured",
        'Correlation between TSH, FTI markers \nand outlier classification (after outliers cleaning)',
        'tsh_fti_plot_outliers_removed.png',
    ),
]

for y_column, title, output_path in cleaned_dataset_plots:
    _save_outlier_scatter(dataset_cleaned, y_column, title, output_path)
|
||||
|
||||
# rate the outlier detection based on the Outlier_label column
# "Positive" means the IQR check flagged the row; the label column holds
# "o" for outlier and "n" for not an outlier.
labeled_as_outlier = input_data["Outlier_label"] == "o"
labeled_as_normal = input_data["Outlier_label"] == "n"

# All four counts come from boolean masks over input_data; rows that were not
# flagged (~found_outliers) are exactly the rows kept in dataset_cleaned.
false_positives_count = (found_outliers & labeled_as_normal).sum()
false_negatives_count = (~found_outliers & labeled_as_outlier).sum()
true_positives_count = (found_outliers & labeled_as_outlier).sum()
true_negatives_count = (~found_outliers & labeled_as_normal).sum()

# Precision and recall are computed for the negative ("not an outlier") class,
# as the printed note states.
precision = true_negatives_count / (true_negatives_count + false_negatives_count)
recall = true_negatives_count / (true_negatives_count + false_positives_count)

print(f"""
---Classification of the outlier detection---
(we treat negative prediction as our target)
True positives: {true_positives_count}
False positives: {false_positives_count}
True negatives: {true_negatives_count}
False negatives: {false_negatives_count}
Precision: {precision}
Recall: {recall}
""")
|
@ -1,6 +1,19 @@
|
||||
contourpy==1.2.0
|
||||
cycler==0.12.1
|
||||
fonttools==4.45.0
|
||||
joblib==1.3.2
|
||||
kiwisolver==1.4.5
|
||||
matplotlib==3.8.2
|
||||
numpy==1.26.2
|
||||
packaging==23.2
|
||||
pandas==2.1.3
|
||||
Pillow==10.1.0
|
||||
pyparsing==3.1.1
|
||||
python-dateutil==2.8.2
|
||||
pytz==2023.3.post1
|
||||
scikit-learn==1.3.2
|
||||
scipy==1.11.4
|
||||
six==1.16.0
|
||||
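# sklearn==0.0.post11  # deprecated PyPI placeholder (its install is expected to fail); scikit-learn==1.3.2 above is the real package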
|
||||
threadpoolctl==3.2.0
|
||||
tzdata==2023.3
|
||||
|
BIN
lab1/tsh_fti_plot.png
Normal file
After Width: | Height: | Size: 39 KiB |
BIN
lab1/tsh_fti_plot_outliers_removed.png
Normal file
After Width: | Height: | Size: 103 KiB |
BIN
lab1/tsh_t3_plot.png
Normal file
After Width: | Height: | Size: 41 KiB |
BIN
lab1/tsh_t3_plot_outliers_removed.png
Normal file
After Width: | Height: | Size: 89 KiB |
BIN
lab1/tsh_t4u_plot.png
Normal file
After Width: | Height: | Size: 41 KiB |
BIN
lab1/tsh_t4u_plot_outliers_removed.png
Normal file
After Width: | Height: | Size: 118 KiB |
BIN
lab1/tsh_tt4_plot.png
Normal file
After Width: | Height: | Size: 43 KiB |
BIN
lab1/tsh_tt4_plot_outliers_removed.png
Normal file
After Width: | Height: | Size: 106 KiB |