# adwb/lab1/preprocess.py

import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Load the raw dataset; the file uses ';' as the field separator.
input_data = pd.read_csv("annthyroid_unsupervised_anomaly_detection.csv", sep=";")

# Quick look at the first ten rows before any cleaning.
print(input_data.head(10))

# --- Missing-value handling -------------------------------------------------

# Show every row that contains at least one missing value.
print("---All rows with missing values---")
rows_with_gaps = input_data.isna().any(axis=1)
print(input_data[rows_with_gaps])

# Count missing values per column and keep only the affected column names.
missing_values = input_data.isnull().sum()
columns_with_missing_values = missing_values[missing_values.gt(0)].index
print("---Columns with missing values---")
print(columns_with_missing_values)

# Columns whose gaps are filled with the column mode
# (the first entry returned by Series.mode()).
mode_filled_columns = (
    "on_antithyroid_medication",
    "pregnant",
    "thyroid_surgery",
    "query_hypothyroid",
    "query_hyperthyroid",
    "lithium",
)
for column_name in mode_filled_columns:
    print(f"Filling column '{column_name}' with mode of this column")
    most_frequent = input_data[column_name].mode().iloc[0]
    input_data[column_name] = input_data[column_name].fillna(most_frequent)

# Columns whose gaps are filled with the column median.
median_filled_columns = (
    "TSH",
    "T3_measured",
    "TT4_measured",
    "T4U_measured",
    "FTI_measured",
)
for column_name in median_filled_columns:
    print(f"Filling column '{column_name}' with median of this column")
    middle_value = input_data[column_name].median()
    input_data[column_name] = input_data[column_name].fillna(middle_value)

# --- Verification: repeat both reports after the imputation -----------------
print("---Checking after filling N/A values---")
missing_values = input_data.isnull().sum()
columns_with_missing_values = missing_values[missing_values.gt(0)].index

print("---Columns with missing values (after filling)---")
print(columns_with_missing_values)

print("---Rows with missing values (after filling)---")
rows_with_gaps = input_data.isna().any(axis=1)
print(input_data[rows_with_gaps])

# Columns that take part in scaling and in the outlier rule below.
columns_with_continuous_attributes = [
    "TSH",
    "T3_measured",
    "TT4_measured",
    "T4U_measured",
    "FTI_measured",
]

# Rescale the continuous columns in place with a default-configured MinMaxScaler.
print("---Normalizing continuous attributes---")
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(input_data[columns_with_continuous_attributes])
input_data[columns_with_continuous_attributes] = scaled_values

# Per-column IQR fences, using a multiplier of 2.5; a row is flagged as an
# outlier when any of its continuous values falls outside its column's fences.
continuous_part = input_data[columns_with_continuous_attributes]
Q1 = continuous_part.quantile(0.25)
Q3 = continuous_part.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 2.5 * IQR
upper_bound = Q3 + 2.5 * IQR
below_fence = continuous_part < lower_bound
above_fence = continuous_part > upper_bound
found_outliers = (below_fence | above_fence).any(axis=1)

print("---Detected outliers based on continuous attributes---")
print(input_data[found_outliers])

# Point colour for each value of the "Outlier_label" column
# (per the legend text below: "o" = outlier, "n" = not an outlier).
color_mapping = {
    "o": "red",
    "n": "green"
}

# Rows that the IQR rule did not flag as outliers.
dataset_cleaned = input_data[~found_outliers]


def _save_label_scatter(frame, y_column, title, filename):
    """Save a scatter plot of "TSH" against ``y_column`` coloured by label.

    Args:
        frame: DataFrame holding "TSH", ``y_column`` and "Outlier_label".
        y_column: Name of the column drawn on the y axis.
        title: Title placed above the axes.
        filename: Path of the image file to write.
    """
    point_colors = frame["Outlier_label"].map(color_mapping)
    # No ``colormap`` here: the points already carry explicit colour names, so
    # a colormap would be ignored by matplotlib and only trigger a warning.
    ax = frame.plot(x="TSH", y=y_column, kind="scatter", c=point_colors)

    # Proxy artists so that the legend shows one coloured dot per label.
    legend_config = [
        plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label)
        for label, color in color_mapping.items()
    ]
    ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")
    ax.set_title(title)

    figure = ax.get_figure()
    figure.savefig(filename)
    # Every DataFrame.plot call opens a new figure; release it once written.
    plt.close(figure)


# (data, y column, title, output file) for each plot: first the full dataset,
# then the same four views with the detected outliers removed.
_scatter_jobs = [
    (input_data, "T3_measured",
     'Correlation between TSH, T3 markers \nand outlier classification',
     'tsh_t3_plot.png'),
    (input_data, "TT4_measured",
     'Correlation between TSH, TT4 markers \nand outlier classification',
     'tsh_tt4_plot.png'),
    (input_data, "T4U_measured",
     'Correlation between TSH, T4U markers \nand outlier classification',
     'tsh_t4u_plot.png'),
    (input_data, "FTI_measured",
     'Correlation between TSH, FTI markers \nand outlier classification',
     'tsh_fti_plot.png'),
    (dataset_cleaned, "T3_measured",
     'Correlation between TSH, T3 markers \n and outlier classification (after outliers cleaning)',
     'tsh_t3_plot_outliers_removed.png'),
    (dataset_cleaned, "TT4_measured",
     'Correlation between TSH, TT4 markers \nand outlier classification (after outliers cleaning)',
     'tsh_tt4_plot_outliers_removed.png'),
    (dataset_cleaned, "T4U_measured",
     'Correlation between TSH, T4U markers \nand outlier classification (after outliers cleaning)',
     'tsh_t4u_plot_outliers_removed.png'),
    (dataset_cleaned, "FTI_measured",
     'Correlation between TSH, FTI markers \nand outlier classification (after outliers cleaning)',
     'tsh_fti_plot_outliers_removed.png'),
]

for _frame, _y_column, _title, _filename in _scatter_jobs:
    _save_label_scatter(_frame, _y_column, _title, _filename)

# Score the IQR-based detection against the "Outlier_label" column.
# "Positive" means a row flagged by the IQR rule; the label value "o" marks an
# outlier and "n" a non-outlier (per the plot legend text).
flagged_labels = input_data["Outlier_label"][found_outliers]
kept_labels = dataset_cleaned["Outlier_label"]

false_positives_count = flagged_labels[flagged_labels == "n"].count()
false_negatives_count = kept_labels[kept_labels == "o"].count()
true_postitives_count = flagged_labels[flagged_labels == "o"].count()
true_negatives_count = kept_labels[kept_labels == "n"].count()

# Precision and recall are computed for the negative ("not an outlier") class,
# as the printed report states.
predicted_negative_total = true_negatives_count + false_negatives_count
actual_negative_total = true_negatives_count + false_positives_count
precision = true_negatives_count / predicted_negative_total
recall = true_negatives_count / actual_negative_total

# The leading and trailing empty entries reproduce the blank lines that
# surround the report.
report_lines = [
    "",
    "---Classification of the outlier detection---",
    "(we treat negative prediction as our target) ",
    f"True positives: {true_postitives_count}",
    f"False positives: {false_positives_count}",
    f"True negatives: {true_negatives_count}",
    f"False negatives: {false_negatives_count}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    "",
]
print("\n".join(report_lines))
WIP 10 months ago			`import pandas as pd`
WIP 10 months ago			`from sklearn.preprocessing import MinMaxScaler`
			`import matplotlib.pyplot as plt`
WIP 10 months ago
			`input_data = pd.read_csv("annthyroid_unsupervised_anomaly_detection.csv", sep=";")`

			`print(input_data.head(10))`
			`# deal with missing values`
WIP 10 months ago
			`# print all rows with missing values`
			`print("---All rows with missing values---")`
			`print(input_data[ input_data.isna().any(axis=1) ])`

			`# Check for missing values in each column`
			`missing_values = input_data.isnull().sum()`

			`# Filter columns with missing values`
			`columns_with_missing_values = missing_values[missing_values > 0].index`

			`# Print columns with missing values`
			`print("---Columns with missing values---")`
			`print(columns_with_missing_values)`

			`print("Filling column 'on_antithyroid_medication' with mode of this column")`
			`input_data["on_antithyroid_medication"] = input_data["on_antithyroid_medication"].fillna(input_data["on_antithyroid_medication"].mode().iloc[0])`

			`print("Filling column 'pregnant' with mode of this column")`
			`input_data["pregnant"] = input_data["pregnant"].fillna(input_data["pregnant"].mode().iloc[0])`

			`print("Filling column 'thyroid_surgery' with mode of this column")`
			`input_data["thyroid_surgery"] = input_data["thyroid_surgery"].fillna(input_data["thyroid_surgery"].mode().iloc[0])`

			`print("Filling column 'query_hypothyroid' with mode of this column")`
			`input_data["query_hypothyroid"] = input_data["query_hypothyroid"].fillna(input_data["query_hypothyroid"].mode().iloc[0])`

			`print("Filling column 'query_hyperthyroid' with mode of this column")`
			`input_data["query_hyperthyroid"] = input_data["query_hyperthyroid"].fillna(input_data["query_hyperthyroid"].mode().iloc[0])`

			`print("Filling column 'lithium' with mode of this column")`
			`input_data["lithium"] = input_data["lithium"].fillna(input_data["lithium"].mode().iloc[0])`

			`print("Filling column 'TSH' with median of this column")`
			`input_data["TSH"] = input_data["TSH"].fillna(input_data["TSH"].median())`

			`print("Filling column 'T3_measured' with median of this column")`
			`input_data["T3_measured"] = input_data["T3_measured"].fillna(input_data["T3_measured"].median())`

			`print("Filling column 'TT4_measured' with median of this column")`
			`input_data["TT4_measured"] = input_data["TT4_measured"].fillna(input_data["TT4_measured"].median())`

			`print("Filling column 'T4U_measured' with median of this column")`
			`input_data["T4U_measured"] = input_data["T4U_measured"].fillna(input_data["T4U_measured"].median())`

			`print("Filling column 'FTI_measured' with median of this column")`
			`input_data["FTI_measured"] = input_data["FTI_measured"].fillna(input_data["FTI_measured"].median())`

			`print("---Checking after filling N/A values---")`
			`# Check for missing values in each column`
			`missing_values = input_data.isnull().sum()`

			`# Filter columns with missing values`
			`columns_with_missing_values = missing_values[missing_values > 0].index`

			`# Print columns with missing values`
			`print("---Columns with missing values (after filling)---")`
			`print(columns_with_missing_values)`

			`# print all rows with missing values`
			`print("---Rows with missing values (after filling)---")`
			`print(input_data[ input_data.isna().any(axis=1) ])`

			`columns_with_continuous_attributes = ["TSH", "T3_measured", "TT4_measured", "T4U_measured", "FTI_measured"]`

			`# normalize columns with continuous attributes`
			`print("---Normalizing continuous attributes---")`
			`scaler = MinMaxScaler()`
			`input_data[columns_with_continuous_attributes] = scaler.fit_transform(input_data[columns_with_continuous_attributes])`

			`# detecting the outliers based on columns with continuous attributes`
			`Q1 = input_data[columns_with_continuous_attributes].quantile(0.25)`
			`Q3 = input_data[columns_with_continuous_attributes].quantile(0.75)`
			`IQR = Q3 - Q1`
			`lower_bound = Q1 - 2.5 * IQR`
			`upper_bound = Q3 + 2.5 * IQR`
			`found_outliers = ( (input_data[columns_with_continuous_attributes] < lower_bound) \| (input_data[columns_with_continuous_attributes] > upper_bound) ).any(axis=1)`
			`print("---Detected outliers based on continuous attributes---")`
			`print(input_data[found_outliers])`

			`color_mapping = {`
			`"o": "red",`
			`"n": "green"`
			`}`
			`ax = input_data.plot(x="TSH", y="T3_measured", kind="scatter", c=input_data["Outlier_label"].map(color_mapping), colormap="viridis")`
			`legend_config = [plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label) for label, color in color_mapping.items()]`
			`ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")`
			`plt.title('Correlation between TSH, T3 markers \nand outlier classification')`
			`plt.savefig('tsh_t3_plot.png')`

			`ax = input_data.plot(x="TSH", y="TT4_measured", kind="scatter", c=input_data["Outlier_label"].map(color_mapping), colormap="viridis")`
			`legend_config = [plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label) for label, color in color_mapping.items()]`
			`ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")`
			`plt.title('Correlation between TSH, TT4 markers \nand outlier classification')`
			`plt.savefig('tsh_tt4_plot.png')`

			`ax = input_data.plot(x="TSH", y="T4U_measured", kind="scatter", c=input_data["Outlier_label"].map(color_mapping), colormap="viridis")`
			`legend_config = [plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label) for label, color in color_mapping.items()]`
			`ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")`
			`plt.title('Correlation between TSH, T4U markers \nand outlier classification')`
			`plt.savefig('tsh_t4u_plot.png')`

			`ax = input_data.plot(x="TSH", y="FTI_measured", kind="scatter", c=input_data["Outlier_label"].map(color_mapping), colormap="viridis")`
			`legend_config = [plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label) for label, color in color_mapping.items()]`
			`ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")`
			`plt.title('Correlation between TSH, FTI markers \nand outlier classification')`
			`plt.savefig('tsh_fti_plot.png')`

			`dataset_cleaned = input_data[~found_outliers]`

			`ax = dataset_cleaned.plot(x="TSH", y="T3_measured", kind="scatter", c=dataset_cleaned["Outlier_label"].map(color_mapping), colormap="viridis")`
			`legend_config = [plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label) for label, color in color_mapping.items()]`
			`ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")`
			`plt.title('Correlation between TSH, T3 markers \n and outlier classification (after outliers cleaning)')`
			`plt.savefig('tsh_t3_plot_outliers_removed.png')`

			`ax = dataset_cleaned.plot(x="TSH", y="TT4_measured", kind="scatter", c=dataset_cleaned["Outlier_label"].map(color_mapping), colormap="viridis")`
			`legend_config = [plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label) for label, color in color_mapping.items()]`
			`ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")`
			`plt.title('Correlation between TSH, TT4 markers \nand outlier classification (after outliers cleaning)')`
			`plt.savefig('tsh_tt4_plot_outliers_removed.png')`

			`ax = dataset_cleaned.plot(x="TSH", y="T4U_measured", kind="scatter", c=dataset_cleaned["Outlier_label"].map(color_mapping), colormap="viridis")`
			`legend_config = [plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label) for label, color in color_mapping.items()]`
			`ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")`
			`plt.title('Correlation between TSH, T4U markers \nand outlier classification (after outliers cleaning)')`
			`plt.savefig('tsh_t4u_plot_outliers_removed.png')`

			`ax = dataset_cleaned.plot(x="TSH", y="FTI_measured", kind="scatter", c=dataset_cleaned["Outlier_label"].map(color_mapping), colormap="viridis")`
			`legend_config = [plt.Line2D([0], [0], marker="o", color="w", markerfacecolor=color, markersize=10, label=label) for label, color in color_mapping.items()]`
			`ax.legend(handles=legend_config, title="Categories: is outlier? \n(o - outlier, n - not an outlier)")`
			`plt.title('Correlation between TSH, FTI markers \nand outlier classification (after outliers cleaning)')`
			`plt.savefig('tsh_fti_plot_outliers_removed.png')`

			`# rate the outlier detection based on the Outlier_label column`
			`false_positives_count = input_data[(found_outliers) & (input_data["Outlier_label"] == "n")]["Outlier_label"].count()`
			`false_negatives_count = dataset_cleaned[dataset_cleaned["Outlier_label"] == "o"]["Outlier_label"].count()`
			`true_postitives_count = input_data[(found_outliers) & (input_data["Outlier_label"] == "o")]["Outlier_label"].count()`
			`true_negatives_count = dataset_cleaned[dataset_cleaned["Outlier_label"] == "n"]["Outlier_label"].count()`
			`precision = true_negatives_count/(true_negatives_count + false_negatives_count)`
			`recall = true_negatives_count/(true_negatives_count + false_positives_count)`

			`print(f"""`
			`---Classification of the outlier detection---`
			`(we treat negative prediction as our target)`
			`True positives: {true_postitives_count}`
			`False positives: {false_positives_count}`
			`True negatives: {true_negatives_count}`
			`False negatives: {false_negatives_count}`
			`Precision: {precision}`
			`Recall: {recall}`
			`""")`