# climateready_heatwave_survey.py

# -*- coding: utf-8 -*-
"""climateready-heatwave-survey.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/16f6Kk54hmL4Gk0FAKd3knuRWVjw5ELjp

# **CLIMATEREADY Thermal Comfort Survey Amid Heatwaves Dataset**

This CLIMATEREADY survey dataset contains thermal comfort votes during the 2021 and 2022 heatwave periods in Pamplona, Spain, as well as other relevant parameters self-reported by surveyees (e.g. occupant characteristics and behaviour, key building/dwelling characteristics), used as a case study for the research paper **Exploring indoor thermal comfort and its causes and consequences amid heatwaves in a Southern European city— An unsupervised learning approach**.

This dataset is part of the [CLIMATEREADY research project](https://experience.arcgis.com/experience/a85fb262378b49dc87381261a2e53c91). Its goal is to assess the adaptability of Spanish homes to global warming, promote passive measures in the design and use of buildings to achieve adequate indoor environmental conditions in summer conditions and in heat wave events, minimise and quantify the risks of overheating with the minimum energy demand for cooling when conditions make it necessary.
"""

# Print the installed scikit-learn version.
import sklearn
print(sklearn.__version__)

"""## **Load libraries**"""

# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

"""## **Connection to Google Drive**"""

# Connect to Google Drive: mount it at /content/drive (google.colab import).
from google.colab import drive
drive.mount('/content/drive')

"""## **Loading and inspecting data**"""

# Path of the survey export (CSV) on the mounted Drive.
dataset = "/content/drive/My Drive/Post-Doc DATAI/Colaboraciones de investigacion/CLIMATEREADY/encuestas/paper/final draft/climateready-heatwave-survey.csv"

# Read the semicolon-separated CSV file. Decoding it as UTF-8 fails
# (UnicodeDecodeError on byte 0xed), so it is read as Latin-1 instead.
merge = pd.read_csv(
    dataset,
    encoding="latin-1",
    sep=";",
)

merge

merge.columns

"""## **Research sub-question 1**

### **Normalized rank**
"""

from scipy.stats import rankdata

# Add a normalized-rank column "<vote>_<period>_NR" for every thermal vote:
# (rank - 1) / (n - 1), where n is the number of rows. Columns are created in
# the order TSen, TSatisf, TPref, each for day and then night.
for _vote in ("TSen", "TSatisf", "TPref"):
    for _period in ("day", "night"):
        _column = f"{_vote}_{_period}"
        _votes = merge[_column]
        merge[f"{_column}_NR"] = (rankdata(_votes) - 1) / (len(_votes) - 1)

"""### **KMeans**


"""

# Feature matrix for clustering: the six normalized-rank vote columns.
# DataFrame.reset_index returns a new frame, so its result has to be kept;
# calling it as a bare statement (as before) left the index untouched.
kmeans_df = (
    merge[['TSen_day_NR', "TSatisf_day_NR", "TPref_day_NR", 'TSen_night_NR', "TSatisf_night_NR", "TPref_night_NR"]]
    .reset_index(drop=True)
    .astype(float)
)
kmeans_df.info()

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Candidate numbers of clusters (2 to 10) and the silhouette score of each.
K = range(2, 11)
silhouette_scores_kmeans = []

# Fit K-means once per candidate k and record the silhouette score.
for k in K:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(kmeans_df)
    score = silhouette_score(kmeans_df, labels)
    silhouette_scores_kmeans.append(score)
    print(f"Silhouette score for K-means with {k} clusters: {score}")

# Silhouette score as a function of the number of clusters.
plt.figure(figsize=(5, 5))
plt.plot(K, silhouette_scores_kmeans, 'bx-')
plt.title('Silhouette Method For Optimal k (K-means)')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()

# Final K-means model with two clusters; the labels are stored on the survey frame.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
merge['cluster_kmeans'] = kmeans.fit_predict(kmeans_df)

# Number of respondents (non-null IDs) per cluster.
merge.groupby("cluster_kmeans")["ID"].count()

# Optional 0 <-> 1 label swap, currently disabled:
# merge['cluster_kmeans'] = merge['cluster_kmeans'].replace({1: 0, 0: 1})

merge.groupby("cluster_kmeans")["ID"].count()

# Keep only the numeric columns, then look at the per-cluster median of the
# night-time preference vote.
numeric_columns = merge.select_dtypes(include=[np.number])

numeric_columns.groupby("cluster_kmeans")["TPref_night"].median()

# Export the respondent ID together with its K-means cluster.
merge[["ID", "cluster_kmeans"]].to_csv('cluster_kmeans_enpunto.csv', sep=';', index=False)

"""#### **Summary: Sensation**


"""

def _cluster_vote_counts(df, cluster_col, cluster_id, vote_col, label):
    """Count respondents per vote value within one cluster.

    Args:
        df: Survey frame holding ``ID``, ``cluster_col`` and ``vote_col``.
        cluster_col: Name of the column with the cluster labels.
        cluster_id: Cluster label whose rows are kept.
        vote_col: Name of the vote column to group by.
        label: Text used as the first index level of the returned row.

    Returns:
        A one-row DataFrame with one column per ``vote_col`` value present in
        the cluster; each cell is the count of non-null ``ID`` entries. The
        row index is the MultiIndex ``(label, "ID")``.
    """
    counts = df[df[cluster_col] == cluster_id].groupby(vote_col).agg("count")[["ID"]]
    counts = counts.transpose()
    counts.index = pd.MultiIndex.from_product([[label], counts.index])
    return counts


def _cluster_vote_summary(cluster0_counts, cluster1_counts, period):
    """Stack two per-cluster count rows and tag them with the time of day.

    Args:
        cluster0_counts: Row built by ``_cluster_vote_counts`` for cluster 0.
        cluster1_counts: Row built by ``_cluster_vote_counts`` for cluster 1.
        period: Value written to the ``time_of_the_day`` column.

    Returns:
        A two-row DataFrame whose ``index`` column holds the cluster labels,
        followed by one column per vote value and ``time_of_the_day``.
    """
    summary = pd.concat([cluster0_counts, cluster1_counts], axis=0)
    # Drop the inner "ID" level so that only the cluster label remains.
    summary = summary.droplevel(1)
    summary = summary.reset_index()
    summary["time_of_the_day"] = period
    return summary


# Sensation votes: cluster 0 is labelled 'Comfortable', cluster 1 'Uncomfortable'.
TSen_day_cluster0 = _cluster_vote_counts(merge, "cluster_kmeans", 0, "TSen_day", 'Comfortable')
TSen_day_cluster1 = _cluster_vote_counts(merge, "cluster_kmeans", 1, "TSen_day", 'Uncomfortable')
TSen_day_summary = _cluster_vote_summary(TSen_day_cluster0, TSen_day_cluster1, "Day")
TSen_day_summary

TSen_night_cluster0 = _cluster_vote_counts(merge, "cluster_kmeans", 0, "TSen_night", 'Comfortable')
TSen_night_cluster1 = _cluster_vote_counts(merge, "cluster_kmeans", 1, "TSen_night", 'Uncomfortable')
TSen_night_summary = _cluster_vote_summary(TSen_night_cluster0, TSen_night_cluster1, "Night")
TSen_night_summary

# Day and night rows together, exported as a semicolon-separated file.
TSen_summary = pd.concat([TSen_day_summary, TSen_night_summary], axis=0)
TSen_summary.to_csv('TSen_summary_kmeans.csv', sep=';', index=False)
TSen_summary

"""#### **Summary: Satisfaction**


"""

TSatis_day_cluster0 = _cluster_vote_counts(merge, "cluster_kmeans", 0, "TSatisf_day", 'Comfortable')
TSatis_day_cluster1 = _cluster_vote_counts(merge, "cluster_kmeans", 1, "TSatisf_day", 'Uncomfortable')
TSatis_day_summary = _cluster_vote_summary(TSatis_day_cluster0, TSatis_day_cluster1, "Day")
TSatis_day_summary

TSatis_night_cluster0 = _cluster_vote_counts(merge, "cluster_kmeans", 0, "TSatisf_night", 'Comfortable')
TSatis_night_cluster1 = _cluster_vote_counts(merge, "cluster_kmeans", 1, "TSatisf_night", 'Uncomfortable')
TSatis_night_summary = _cluster_vote_summary(TSatis_night_cluster0, TSatis_night_cluster1, "Night")
TSatis_night_summary

TSatis_summary = pd.concat([TSatis_day_summary, TSatis_night_summary], axis=0)
TSatis_summary.to_csv('TSatis_summary_kmeans.csv', sep=';', index=False)
TSatis_summary

"""#### **Summary: Preference**


"""

TPref_day_cluster0 = _cluster_vote_counts(merge, "cluster_kmeans", 0, "TPref_day", 'Comfortable')
TPref_day_cluster1 = _cluster_vote_counts(merge, "cluster_kmeans", 1, "TPref_day", 'Uncomfortable')
TPref_day_summary = _cluster_vote_summary(TPref_day_cluster0, TPref_day_cluster1, "Day")
TPref_day_summary

TPref_night_cluster0 = _cluster_vote_counts(merge, "cluster_kmeans", 0, "TPref_night", 'Comfortable')
TPref_night_cluster1 = _cluster_vote_counts(merge, "cluster_kmeans", 1, "TPref_night", 'Uncomfortable')
TPref_night_summary = _cluster_vote_summary(TPref_night_cluster0, TPref_night_cluster1, "Night")
TPref_night_summary

TPref_summary = pd.concat([TPref_day_summary, TPref_night_summary], axis=0)
TPref_summary.to_csv('TPref_summary_kmeans.csv', sep=';', index=False)
TPref_summary

"""### **Hierarchical clustering**


"""

import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Feature matrix for hierarchical clustering: the six normalized-rank vote
# columns, re-indexed from 0 (the old index is dropped, not kept as a column)
# and cast to float.
hclust_df = (
    merge[['TSen_day_NR', 'TSatisf_day_NR', 'TPref_day_NR', 'TSen_night_NR', 'TSatisf_night_NR', 'TPref_night_NR']]
    .reset_index(drop=True)
    .astype(float)
)

# Silhouette scan for agglomerative clustering (complete linkage, Euclidean
# distance) over the candidate cluster counts in K.
silhouette_scores_hierarchical = []
for k in K:
    hierarchical_clustering = AgglomerativeClustering(linkage='complete', metric='euclidean', n_clusters=k)
    labels = hierarchical_clustering.fit_predict(hclust_df)
    score = silhouette_score(hclust_df, labels)
    silhouette_scores_hierarchical.append(score)
    print(f"Silhouette score for Hierarchical Clustering with {k} clusters: {score}")

# Silhouette score as a function of the number of clusters.
plt.figure(figsize=(5, 5))
plt.plot(K, silhouette_scores_hierarchical, 'bx-')
plt.title('Silhouette Method For Optimal k (Hierarchical Clustering)')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette Score')
plt.show()

# Final agglomerative model with two clusters. Complete linkage is used here;
# the original note lists average linkage as the alternative.
n_clusters = 2
hierarchical_clustering = AgglomerativeClustering(linkage='complete', metric='euclidean', n_clusters=n_clusters)
merge['cluster_hierarchical'] = hierarchical_clustering.fit_predict(hclust_df)

# Respondents (non-null IDs) per cluster, before the labels are swapped.
cluster_counts = merge.groupby('cluster_hierarchical').agg(count=('ID', 'count'))
print(cluster_counts)

# Swap the two cluster labels (0 becomes 1 and 1 becomes 0).
merge['cluster_hierarchical'] = merge['cluster_hierarchical'].replace({0: 1, 1: 0})

merge.groupby("cluster_hierarchical")["ID"].count()

# Keep only the numeric columns, then look at the per-cluster median of the
# night-time preference vote.
numeric_columns = merge.select_dtypes(include=[np.number])

numeric_columns.groupby("cluster_hierarchical")["TPref_night"].median()

# Export the respondent ID together with its hierarchical cluster.
merge[["ID", "cluster_hierarchical"]].to_csv('cluster_hierarchical_enpunto.csv', sep=';', index=False)

"""##### **Summary: Sensation**


"""

def _cluster_vote_counts(df, cluster_col, cluster_id, vote_col, label):
    """Count respondents per vote value within one cluster.

    Args:
        df: Survey frame holding ``ID``, ``cluster_col`` and ``vote_col``.
        cluster_col: Name of the column with the cluster labels.
        cluster_id: Cluster label whose rows are kept.
        vote_col: Name of the vote column to group by.
        label: Text used as the first index level of the returned row.

    Returns:
        A one-row DataFrame with one column per ``vote_col`` value present in
        the cluster; each cell is the count of non-null ``ID`` entries. The
        row index is the MultiIndex ``(label, "ID")``.
    """
    counts = df[df[cluster_col] == cluster_id].groupby(vote_col).agg("count")[["ID"]]
    counts = counts.transpose()
    counts.index = pd.MultiIndex.from_product([[label], counts.index])
    return counts


def _cluster_vote_summary(cluster0_counts, cluster1_counts, period):
    """Stack two per-cluster count rows and tag them with the time of day.

    Args:
        cluster0_counts: Row built by ``_cluster_vote_counts`` for cluster 0.
        cluster1_counts: Row built by ``_cluster_vote_counts`` for cluster 1.
        period: Value written to the ``time_of_the_day`` column.

    Returns:
        A two-row DataFrame whose ``index`` column holds the cluster labels,
        followed by one column per vote value and ``time_of_the_day``.
    """
    summary = pd.concat([cluster0_counts, cluster1_counts], axis=0)
    # Drop the inner "ID" level so that only the cluster label remains.
    summary = summary.droplevel(1)
    summary = summary.reset_index()
    summary["time_of_the_day"] = period
    return summary


# Sensation votes: cluster 0 is labelled 'Comfortable', cluster 1 'Uncomfortable'.
TSen_day_cluster0 = _cluster_vote_counts(merge, "cluster_hierarchical", 0, "TSen_day", 'Comfortable')
TSen_day_cluster1 = _cluster_vote_counts(merge, "cluster_hierarchical", 1, "TSen_day", 'Uncomfortable')
TSen_day_summary = _cluster_vote_summary(TSen_day_cluster0, TSen_day_cluster1, "Day")
TSen_day_summary

TSen_night_cluster0 = _cluster_vote_counts(merge, "cluster_hierarchical", 0, "TSen_night", 'Comfortable')
TSen_night_cluster1 = _cluster_vote_counts(merge, "cluster_hierarchical", 1, "TSen_night", 'Uncomfortable')
TSen_night_summary = _cluster_vote_summary(TSen_night_cluster0, TSen_night_cluster1, "Night")
TSen_night_summary

# Day and night rows together, exported as a semicolon-separated file.
TSen_summary = pd.concat([TSen_day_summary, TSen_night_summary], axis=0)
TSen_summary.to_csv('TSen_summary_hclust.csv', sep=';', index=False)
TSen_summary

"""##### **Summary: Satisfaction**


"""

TSatis_day_cluster0 = _cluster_vote_counts(merge, "cluster_hierarchical", 0, "TSatisf_day", 'Comfortable')
TSatis_day_cluster1 = _cluster_vote_counts(merge, "cluster_hierarchical", 1, "TSatisf_day", 'Uncomfortable')
TSatis_day_summary = _cluster_vote_summary(TSatis_day_cluster0, TSatis_day_cluster1, "Day")
TSatis_day_summary

TSatis_night_cluster0 = _cluster_vote_counts(merge, "cluster_hierarchical", 0, "TSatisf_night", 'Comfortable')
TSatis_night_cluster1 = _cluster_vote_counts(merge, "cluster_hierarchical", 1, "TSatisf_night", 'Uncomfortable')
TSatis_night_summary = _cluster_vote_summary(TSatis_night_cluster0, TSatis_night_cluster1, "Night")
TSatis_night_summary

TSatis_summary = pd.concat([TSatis_day_summary, TSatis_night_summary], axis=0)
TSatis_summary.to_csv('TSatis_summary_hclust.csv', sep=';', index=False)
TSatis_summary

"""##### **Summary: Preference**


"""

TPref_day_cluster0 = _cluster_vote_counts(merge, "cluster_hierarchical", 0, "TPref_day", 'Comfortable')
TPref_day_cluster1 = _cluster_vote_counts(merge, "cluster_hierarchical", 1, "TPref_day", 'Uncomfortable')
TPref_day_summary = _cluster_vote_summary(TPref_day_cluster0, TPref_day_cluster1, "Day")
TPref_day_summary

TPref_night_cluster0 = _cluster_vote_counts(merge, "cluster_hierarchical", 0, "TPref_night", 'Comfortable')
TPref_night_cluster1 = _cluster_vote_counts(merge, "cluster_hierarchical", 1, "TPref_night", 'Uncomfortable')
TPref_night_summary = _cluster_vote_summary(TPref_night_cluster0, TPref_night_cluster1, "Night")
TPref_night_summary

TPref_summary = pd.concat([TPref_day_summary, TPref_night_summary], axis=0)
TPref_summary.to_csv('TPref_summary_hclust.csv', sep=';', index=False)
TPref_summary

"""### **Venn Diagram**


"""

# Agreement table between the two clusterings: rows are the K-means labels,
# columns are the hierarchical labels.
df_subset = merge[['cluster_kmeans', 'cluster_hierarchical']]

confusion_matrix = pd.crosstab(
    df_subset['cluster_kmeans'],
    df_subset['cluster_hierarchical'],
    rownames=['K-Means'],
    colnames=['Hierarchical Clustering'],
)

print(confusion_matrix)

# A bare `pip install matplotlib_venn` only works as a notebook command and is
# a SyntaxError in a plain Python file. Install the package through pip only
# when it cannot be imported.
try:
    import matplotlib_venn
except ImportError:
    import importlib
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib_venn"])
    # Make the freshly installed package visible to the import system.
    importlib.invalidate_caches()

import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn2

# Overlap between the two clusterings, one Venn diagram per cluster label:
# cluster 1 is saved as venn_htd.png, cluster 0 as venn_ltd.png.
for _cluster_id, _png_name in ((1, "venn_htd.png"), (0, "venn_ltd.png")):
    # Row labels of the respondents assigned to this cluster by each method.
    kmeans_values = set(merge.loc[merge['cluster_kmeans'] == _cluster_id].index)
    hierarchical_values = set(merge.loc[merge['cluster_hierarchical'] == _cluster_id].index)

    # Region sizes in the order venn2 expects: only K-means, only
    # hierarchical, both.
    _region_sizes = (
        len(kmeans_values - hierarchical_values),
        len(hierarchical_values - kmeans_values),
        len(kmeans_values & hierarchical_values),
    )

    # venn2 draws on the current axes. Start a new figure for every diagram so
    # the second one is not drawn over the first when plt.show() leaves the
    # previous figure open.
    plt.figure()
    venn_labels = venn2(subsets=_region_sizes, set_labels=('KMeans', 'HierarchicalClustering'))
    plt.savefig(_png_name, dpi=300)  # Save the plot as PNG with 300dpi
    plt.show()

from sklearn.metrics import adjusted_rand_score

# Cluster labels of both methods as 1D NumPy arrays.
kmeans_array = merge["cluster_kmeans"].to_numpy()
hierarchical_array = merge["cluster_hierarchical"].to_numpy()

# Adjusted Rand index between the K-means and hierarchical labelings.
ari = adjusted_rand_score(kmeans_array, hierarchical_array)

print(f'ARI for LTD: {ari}')

"""##### **Crosstab Sensation**


"""

import seaborn as sns

"""**K-Means**"""

# Night-by-day cross-tabulation of the sensation votes inside each K-means
# cluster; cluster 1 is printed first, then cluster 0.
df_subset = merge[['TSen_day', 'TSen_night', 'cluster_kmeans']]

for _cluster_id in (1, 0):
    filtered_df = df_subset.loc[df_subset['cluster_kmeans'] == _cluster_id]
    confusion_matrix = pd.crosstab(
        filtered_df['TSen_night'],
        filtered_df['TSen_day'],
        rownames=['TSen_night'],
        colnames=['TSen_day'],
    )
    print(confusion_matrix)

"""**Hierarchical Clustering**"""

# Night-by-day cross-tabulation of the sensation votes inside each
# hierarchical cluster; cluster 1 is printed first, then cluster 0.
df_subset = merge[['TSen_day', 'TSen_night', 'cluster_hierarchical']]

for _cluster_id in (1, 0):
    filtered_df = df_subset.loc[df_subset['cluster_hierarchical'] == _cluster_id]
    confusion_matrix = pd.crosstab(
        filtered_df['TSen_night'],
        filtered_df['TSen_day'],
        rownames=['TSen_night'],
        colnames=['TSen_day'],
    )
    print(confusion_matrix)

"""**Histograms**"""

# Day-vs-night 2D histograms of the sensation votes for cluster 1 (labelled
# 'Uncomfortable') of each clustering method, one panel per method.
# The statements start at column 0: the former stray indentation made this
# section an IndentationError.
filters = ["cluster_kmeans", "cluster_hierarchical"]
_panel_titles = {"cluster_kmeans": "K-Means", "cluster_hierarchical": "Hierarchical Clustering"}

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

for ax, filter_col in zip(axes, filters):
    # Rows of the cluster shown in this panel; filtered once and reused for
    # the histogram, the size label and the cell annotations.
    data_subset = merge[merge[filter_col] == 1]
    sns.histplot(data_subset, x="TSen_day", y="TSen_night", legend=False, discrete=(True, True), ax=ax)

    ticks = range(-3, 4)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)

    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)

    ax.set_title(_panel_titles[filter_col])
    # Cluster size in the top left corner, computed from the plotted rows so
    # that the label cannot drift from the data.
    ax.text(-3.8, 3.8, f"'Uncomfortable' cluster (n={len(data_subset)})", fontsize=10, ha='left', va='top', color='black')

    ax.set_xlabel("TSEN_day")
    ax.set_ylabel("TSEN_night")

    # Write the frequency of every (day, night) vote pair on its cell.
    for (x, y), count in data_subset.groupby(["TSen_day", "TSen_night"]).size().items():
        ax.annotate(count, (x, y), textcoords="offset points", xytext=(0, 3), ha='center', fontsize=10, color='white')

# Adjust layout and show the plot
plt.tight_layout()
plt.savefig("TSen_HTD.png", dpi=300)  # Save the plot as PNG with 300dpi
plt.show()

# Day-vs-night 2D histograms of the sensation votes for cluster 0 (labelled
# 'Comfortable') of each clustering method, one panel per method.
filters = ["cluster_kmeans", "cluster_hierarchical"]
_panel_titles = {"cluster_kmeans": "K-Means", "cluster_hierarchical": "Hierarchical Clustering"}

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

for ax, filter_col in zip(axes, filters):
    # Rows of the cluster shown in this panel; filtered once and reused for
    # the histogram, the size label and the cell annotations.
    data_subset = merge[merge[filter_col] == 0]
    sns.histplot(data_subset, x="TSen_day", y="TSen_night", legend=False, discrete=(True, True), ax=ax)

    ticks = range(-3, 4)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)

    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)

    ax.set_title(_panel_titles[filter_col])
    # Cluster size in the top left corner, computed from the plotted rows
    # instead of a hard-coded count.
    ax.text(-3.8, 3.8, f"'Comfortable' cluster (n={len(data_subset)})", fontsize=10, ha='left', va='top', color='black')

    ax.set_xlabel("TSEN_day")
    ax.set_ylabel("TSEN_night")

    # Write the frequency of every (day, night) vote pair on its cell.
    for (x, y), count in data_subset.groupby(["TSen_day", "TSen_night"]).size().items():
        ax.annotate(count, (x, y), textcoords="offset points", xytext=(0, 3), ha='center', fontsize=10, color='white')

# Adjust layout and show the plot
plt.tight_layout()
plt.savefig("TSen_LTD.png", dpi=300)  # Save the plot as PNG with 300dpi
plt.show()

"""##### **Crosstab Satisfaction**

**K-Means**
"""

# Night-by-day cross-tabulation of the satisfaction votes inside each K-means
# cluster; cluster 1 is printed first, then cluster 0.
df_subset = merge[['TSatisf_day', 'TSatisf_night', 'cluster_kmeans']]

for _cluster_id in (1, 0):
    filtered_df = df_subset.loc[df_subset['cluster_kmeans'] == _cluster_id]
    confusion_matrix = pd.crosstab(
        filtered_df['TSatisf_night'],
        filtered_df['TSatisf_day'],
        rownames=['TSatisf_night'],
        colnames=['TSatisf_day'],
    )
    print(confusion_matrix)

"""**Hierarchical Clustering**"""

# Night-by-day cross-tabulation of the satisfaction votes inside each
# hierarchical cluster; cluster 1 is printed first, then cluster 0.
df_subset = merge[['TSatisf_day', 'TSatisf_night', 'cluster_hierarchical']]

for _cluster_id in (1, 0):
    filtered_df = df_subset.loc[df_subset['cluster_hierarchical'] == _cluster_id]
    confusion_matrix = pd.crosstab(
        filtered_df['TSatisf_night'],
        filtered_df['TSatisf_day'],
        rownames=['TSatisf_night'],
        colnames=['TSatisf_day'],
    )
    print(confusion_matrix)

# Day-vs-night 2D histograms of the satisfaction votes for cluster 1 (labelled
# 'Uncomfortable') of each clustering method, one panel per method.
filters = ["cluster_kmeans", "cluster_hierarchical"]
_panel_titles = {"cluster_kmeans": "K-Means", "cluster_hierarchical": "Hierarchical Clustering"}

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

for ax, filter_col in zip(axes, filters):
    # Rows of the cluster shown in this panel; filtered once and reused for
    # the histogram, the size label and the cell annotations.
    data_subset = merge[merge[filter_col] == 1]
    sns.histplot(data_subset, x="TSatisf_day", y="TSatisf_night", legend=False, discrete=(True, True), ax=ax)

    ticks = range(-3, 4)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)

    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)

    ax.set_title(_panel_titles[filter_col])
    # Cluster size in the top left corner, computed from the plotted rows
    # instead of a hard-coded count.
    ax.text(-3.8, 3.8, f"'Uncomfortable' cluster (n={len(data_subset)})", fontsize=10, ha='left', va='top', color='black')

    ax.set_xlabel("TSATIS_day")
    ax.set_ylabel("TSATIS_night")

    # Write the frequency of every (day, night) vote pair on its cell.
    for (x, y), count in data_subset.groupby(["TSatisf_day", "TSatisf_night"]).size().items():
        ax.annotate(count, (x, y), textcoords="offset points", xytext=(0, 3), ha='center', fontsize=10, color='white')

# Adjust layout and show the plot
plt.tight_layout()
plt.savefig("TSatisf_HTD.png", dpi=300)  # Save the plot as PNG with 300dpi
plt.show()

# Day-vs-night 2D histograms of the satisfaction votes for cluster 0 (labelled
# 'Comfortable') of each clustering method, one panel per method.
filters = ["cluster_kmeans", "cluster_hierarchical"]
_panel_titles = {"cluster_kmeans": "K-Means", "cluster_hierarchical": "Hierarchical Clustering"}

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

for ax, filter_col in zip(axes, filters):
    # Rows of the cluster shown in this panel; filtered once and reused for
    # the histogram, the size label and the cell annotations.
    data_subset = merge[merge[filter_col] == 0]
    sns.histplot(data_subset, x="TSatisf_day", y="TSatisf_night", legend=False, discrete=(True, True), ax=ax)

    ticks = range(-3, 4)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)

    ax.set_xlim(-4, 4)
    ax.set_ylim(-4, 4)

    ax.set_title(_panel_titles[filter_col])
    # Cluster size in the top left corner, computed from the plotted rows
    # instead of a hard-coded count.
    ax.text(-3.8, 3.8, f"'Comfortable' cluster (n={len(data_subset)})", fontsize=10, ha='left', va='top', color='black')

    ax.set_xlabel("TSATIS_day")
    ax.set_ylabel("TSATIS_night")

    # Write the frequency of every (day, night) vote pair on its cell.
    for (x, y), count in data_subset.groupby(["TSatisf_day", "TSatisf_night"]).size().items():
        ax.annotate(count, (x, y), textcoords="offset points", xytext=(0, 3), ha='center', fontsize=10, color='white')

# Adjust layout and show the plot
plt.tight_layout()
plt.savefig("TSatisf_LTD.png", dpi=300)  # Save the plot as PNG with 300dpi
plt.show()

"""##### **Crosstab Preference**

**K-Means**
"""

# Night-by-day cross-tabulation of the preference votes inside each K-means
# cluster; cluster 1 is printed first, then cluster 0.
df_subset = merge[['TPref_day', 'TPref_night', 'cluster_kmeans']]

for _cluster_id in (1, 0):
    filtered_df = df_subset.loc[df_subset['cluster_kmeans'] == _cluster_id]
    confusion_matrix = pd.crosstab(
        filtered_df['TPref_night'],
        filtered_df['TPref_day'],
        rownames=['TPref_night'],
        colnames=['TPref_day'],
    )
    print(confusion_matrix)

"""**Hierarchical**"""

# Cross-tabulate night-time against day-time thermal preference votes inside
# each hierarchical cluster. Label 1 is printed first, then label 0.
df_subset = merge.loc[:, ['TPref_day', 'TPref_night', 'cluster_hierarchical']]

for cluster_label in (1, 0):
    # Keep only the rows assigned to the current cluster label.
    in_cluster = df_subset['cluster_hierarchical'].eq(cluster_label)
    filtered_df = df_subset.loc[in_cluster]

    # Rows are the TPref_night values, columns are the TPref_day values.
    confusion_matrix = pd.crosstab(
        filtered_df['TPref_night'],
        filtered_df['TPref_day'],
        rownames=['TPref_night'],
        colnames=['TPref_day'],
    )

    print(confusion_matrix)

# Joint distribution of day-time vs. night-time thermal preference votes for
# the rows labelled 1 by each clustering method, one panel per method.
filters = ["cluster_kmeans", "cluster_hierarchical"]
panel_titles = {
    "cluster_kmeans": "K-Means",
    "cluster_hierarchical": "Hierarchical Clustering",
}

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

for i, filter_col in enumerate(filters):
    ax = axes[i]

    # Filter once and reuse the same frame for the 2-D histogram, the count
    # labels and the sample size shown in the panel.
    data_subset = merge[merge[filter_col] == 1]
    sns.histplot(data_subset, x="TPref_day", y="TPref_night", legend=False, discrete=(True, True), ax=ax)

    # Ticks at -1, 0 and 1; the axis limits leave a one-unit margin around them.
    ticks = range(-1, 2)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)

    ax.set_title(panel_titles[filter_col])

    # The sample size is read from the filtered frame instead of being typed
    # in by hand, so the annotation stays correct if the clustering changes.
    ax.text(-1.8, 1.8, f"'Uncomfortable' cluster (n={len(data_subset)})",
            fontsize=10, ha='left', va='top', color='black')

    ax.set_xlabel("TPREF_day")
    ax.set_ylabel("TPREF_night")

    # Write the number of rows on top of every occupied (day, night) cell.
    for (x, y), count in data_subset.groupby(["TPref_day", "TPref_night"]).size().items():
        ax.annotate(str(count), (x, y), textcoords="offset points", xytext=(0, 3),
                    ha='center', fontsize=10, color='white')

# Adjust layout, save the figure as a 300 dpi PNG and show it
plt.tight_layout()
plt.savefig("TPref_HTD.png", dpi=300)
plt.show()

# Joint distribution of day-time vs. night-time thermal preference votes for
# the rows labelled 0 by each clustering method, one panel per method.
filters = ["cluster_kmeans", "cluster_hierarchical"]
panel_titles = {
    "cluster_kmeans": "K-Means",
    "cluster_hierarchical": "Hierarchical Clustering",
}

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

for i, filter_col in enumerate(filters):
    ax = axes[i]

    # Filter once and reuse the same frame for the 2-D histogram, the count
    # labels and the sample size shown in the panel.
    data_subset = merge[merge[filter_col] == 0]
    sns.histplot(data_subset, x="TPref_day", y="TPref_night", legend=False, discrete=(True, True), ax=ax)

    # Ticks at -1, 0 and 1; the axis limits leave a one-unit margin around them.
    ticks = range(-1, 2)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)

    ax.set_title(panel_titles[filter_col])

    # The sample size is read from the filtered frame instead of being typed
    # in by hand, so the annotation stays correct if the clustering changes.
    ax.text(-1.8, 1.8, f"'Comfortable' cluster (n={len(data_subset)})",
            fontsize=10, ha='left', va='top', color='black')

    ax.set_xlabel("TPREF_day")
    ax.set_ylabel("TPREF_night")

    # Write the number of rows on top of every occupied (day, night) cell.
    for (x, y), count in data_subset.groupby(["TPref_day", "TPref_night"]).size().items():
        ax.annotate(str(count), (x, y), textcoords="offset points", xytext=(0, 3),
                    ha='center', fontsize=10, color='white')

# Adjust layout, save the figure as a 300 dpi PNG and show it
plt.tight_layout()
plt.savefig("TPref_LTD.png", dpi=300)
plt.show()

"""## **Research sub-question 2**


"""

import statsmodels.api as sm
import statsmodels.formula.api as smf

"""### **Replace extreme values with mean values**


"""

# Box plot of all thermostat readings, used to spot extreme values.
sns.boxplot(data=merge, y="ThermostatTemp", fill=False, gap=0.1, fliersize=3)
plt.show()

# Mean thermostat reading of the rows flagged hw_True.
# NOTE(review): computed before the replacement below, so the extreme
# readings themselves still contribute to this mean - confirm this is intended.
in_heatwave = merge["hw_True"] == True
temp_hw_true = merge.loc[in_heatwave, "ThermostatTemp"].mean()
temp_hw_true

# Mean thermostat reading of the rows where hw_True is False (same caveat).
outside_heatwave = merge["hw_True"] == False
temp_hw_false = merge.loc[outside_heatwave, "ThermostatTemp"].mean()
temp_hw_false

# Readings of exactly 36.0 receive the hw_True mean and readings of exactly
# 35.0 the hw_False mean. The second mask is built after the first write.
# NOTE(review): the match is on the reading only, not on the row's hw_True
# flag - presumably each value occurs in a single period; verify.
reads_36 = merge['ThermostatTemp'] == 36.0
merge.loc[reads_36, 'ThermostatTemp'] = temp_hw_true
reads_35 = merge['ThermostatTemp'] == 35.0
merge.loc[reads_35, 'ThermostatTemp'] = temp_hw_false

# List any rows whose reading is still 35 or above after the replacement.
still_extreme = merge["ThermostatTemp"] >= 35
merge.loc[still_extreme, ["hw_True", "ID", "cluster_kmeans", "ThermostatTemp", "meanTout", "TSen_day"]]

"""### **Discomfort - KMeans**


"""

# Thermostat temperature per period (hw_True on the x axis), split by K-means
# cluster label.
f, ax = plt.subplots(figsize=(5, 5))

# Display names written over the legend entries, in legend order
hue_order = ['Comfortable', 'Uncomfortable']

sns.boxplot(
    data=merge,
    x="hw_True",
    y="ThermostatTemp",
    hue="cluster_kmeans",
    fill=False,
    gap=0.1,
    fliersize=3,
    ax=ax,
)

# Axis titles and readable names for the two period ticks
ax.set_xlabel("Period")
ax.set_ylabel("Indoor temperature (ºC, thermostat)")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Non-HW', 'HW'])

# Rebuild the legend, give it a title and overwrite its entry texts
legend = ax.legend()
legend.set_title("Cluster")
legend_texts = legend.get_texts()
for i, label in enumerate(hue_order):
    legend_texts[i].set_text(label)

# Method name just above the top-right corner of the axes
ax.text(1, 1.05, "K-means", fontsize=12, ha='right', va='top', transform=ax.transAxes)

# Save as a 300 dpi PNG, then display
f.savefig("temp_kmeans.png", dpi=300)
plt.show()

# Split the "Hour" column into three eight-hour slots. The lower bound of each
# slot is inclusive and the upper bound exclusive; the night slot wraps
# around midnight.
response_hour = merge["Hour"]

night = response_hour.ge(23) | response_hour.lt(7)
night.sum()

morning = response_hour.ge(7) & response_hour.lt(15)
morning.sum()

afternoon = response_hour.ge(15) & response_hour.lt(23)
afternoon.sum()

# Store the three masks as boolean columns on the survey frame
merge["cte_23_7h_True"] = night
merge["cte_7_15h_True"] = morning
merge["cte_15_23h_True"] = afternoon

# Define a function to determine the time of the day
def get_time_of_day(row):
    """Return the time-slot label of one survey row.

    The flags are checked in the order night, morning, afternoon and the
    label of the first truthy one is returned, so flags after the first
    truthy one are never read.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by
            'cte_23_7h_True', 'cte_7_15h_True' and 'cte_15_23h_True'.

    Returns:
        '23:00 - 7:00 h', '7:00 - 15:00 h', '15:00 - 23:00 h', or
        'unknown' when none of the three flags is truthy.
    """
    slot_labels = (
        ('cte_23_7h_True', '23:00 - 7:00 h'),
        ('cte_7_15h_True', '7:00 - 15:00 h'),
        ('cte_15_23h_True', '15:00 - 23:00 h'),
    )
    for flag, slot_label in slot_labels:
        if row[flag]:
            return slot_label
    # No slot flag is set for this row
    return 'unknown'

# Label every survey row with its time slot; get_time_of_day is called once
# per row and the result is stored in a new 'time_of_the_day' column.
merge['time_of_the_day'] = merge.apply(get_time_of_day, axis="columns")

# Median thermostat temperature of each K-means cluster label; drawn further
# down as one horizontal reference line per cluster.
avg_temps = merge.groupby('cluster_kmeans')['ThermostatTemp'].median()

f, ax = plt.subplots(figsize=(5, 5))

# Cluster display names (not read again by the statements in this cell)
hue_order = ['Comfortable', 'Uncomfortable']

sns.boxplot(
    data=merge,
    x="cluster_kmeans",
    y="ThermostatTemp",
    hue="time_of_the_day",
    fill=False,
    gap=0.1,
    fliersize=3,
    ax=ax,
)

# Axis titles, fixed temperature range and readable cluster tick names
ax.set_xlabel("Cluster")
ax.set_ylabel("Indoor temperature (ºC, thermostat)")
ax.set_ylim(18, 32)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Comfortable', 'Uncomfortable'])

# One dashed red line per cluster at its median temperature.
# NOTE(review): the line labels say "Avg" although the value is a median, and
# no legend call follows, so they presumably never appear in the legend.
cluster_names = ["Comfortable", "Uncomfortable"]
for idx, temp in avg_temps.items():
    reference_label = f'Overall Avg {cluster_names[idx]} ({temp:.2f}ºC)'
    ax.axhline(temp, color='red', linestyle='--', label=reference_label)

# Method name just above the top-right corner of the axes
ax.text(1, 1.05, "K-means", fontsize=12, ha='right', va='top', transform=ax.transAxes)

# Save as a 300 dpi PNG, then display
f.savefig("temp_kmeans_time_day.png", dpi=300)
plt.show()

# Number of non-missing thermostat readings per K-means cluster and time slot.
merge.groupby(["cluster_kmeans", "time_of_the_day"])["ThermostatTemp"].count()

# Median thermostat temperature per K-means cluster and time slot.
# ThermostatTemp is selected before aggregating, so the median is computed
# for that column only and does not depend on every other column of the
# frame being numeric.
result = merge.groupby(["cluster_kmeans", "time_of_the_day"])["ThermostatTemp"].median()
print(result)

# Median thermostat temperature per K-means cluster, all time slots pooled.
result = merge.groupby(["cluster_kmeans"])["ThermostatTemp"].median()
print(result)

# Copy of the frame without the "time_of_the_day" and "Timestamp" columns.
# The tables above no longer need it; the name is still defined in case
# later cells reference it.
merge_without_timestamp = merge.drop(columns=["time_of_the_day", "Timestamp"])

# Survey rows split by time slot ...
cte_7_15h_True_data = merge[merge["time_of_the_day"] == '7:00 - 15:00 h']
cte_15_23h_True_data = merge[merge["time_of_the_day"] == '15:00 - 23:00 h']
cte_23_7h_True_data = merge[merge["time_of_the_day"] == '23:00 - 7:00 h']

# ... and, within each slot and overall, by K-means cluster label:
# label 1 -> *_cluster_True, label 0 -> *_cluster_False.
cte_7_15h_True_cluster_True = cte_7_15h_True_data[cte_7_15h_True_data["cluster_kmeans"] == 1]
cte_15_23h_True_cluster_True = cte_15_23h_True_data[cte_15_23h_True_data["cluster_kmeans"] == 1]
cte_23_7h_True_cluster_True = cte_23_7h_True_data[cte_23_7h_True_data["cluster_kmeans"] == 1]
cluster_True = merge[merge["cluster_kmeans"] == 1]

cte_7_15h_True_cluster_False = cte_7_15h_True_data[cte_7_15h_True_data["cluster_kmeans"] == 0]
cte_15_23h_True_cluster_False = cte_15_23h_True_data[cte_15_23h_True_data["cluster_kmeans"] == 0]
cte_23_7h_True_cluster_False = cte_23_7h_True_data[cte_23_7h_True_data["cluster_kmeans"] == 0]
cluster_False = merge[merge["cluster_kmeans"] == 0]

from scipy.stats import mannwhitneyu

# Mann-Whitney U test on the thermostat temperature of the two clusters
# (all time slots pooled). Missing readings are removed first: a NaN in
# either sample could otherwise yield a NaN p-value, and `NaN < alpha` is
# False, which would print the "no difference" message below.
temps_cluster_1 = cluster_True["ThermostatTemp"].dropna()
temps_cluster_0 = cluster_False["ThermostatTemp"].dropna()
statistic, p_value = mannwhitneyu(temps_cluster_1, temps_cluster_0)

# Display the test results
print("Mann-Whitney U Test Results:")
print(f"Statistic: {statistic}")
print(f"P-value: {p_value}")

# Compare the p-value with the 5 % significance level
alpha = 0.05
if p_value < alpha:
    print("The difference between the groups is statistically significant.")
else:
    print("There is no statistically significant difference between the groups.")

"""### **Discomfort - Hclust**


"""

# Thermostat temperature per period (hw_True on the x axis), split by
# hierarchical cluster label.
f, ax = plt.subplots(figsize=(5, 5))

# Display names written over the legend entries, in legend order
hue_order = ['Comfortable', 'Uncomfortable']

sns.boxplot(
    data=merge,
    x="hw_True",
    y="ThermostatTemp",
    hue="cluster_hierarchical",
    fill=False,
    gap=0.1,
    fliersize=3,
    ax=ax,
)

# Axis titles and readable names for the two period ticks
ax.set_xlabel("Period")
ax.set_ylabel("Indoor temperature (ºC, thermostat)")
ax.set_xticks([0, 1])
ax.set_xticklabels(['Non-HW', 'HW'])

# Rebuild the legend, give it a title and overwrite its entry texts
legend = ax.legend()
legend.set_title("Cluster")
legend_texts = legend.get_texts()
for i, label in enumerate(hue_order):
    legend_texts[i].set_text(label)

# Method name just above the top-right corner of the axes
ax.text(1, 1.05, "Hierarchical clustering", fontsize=12, ha='right', va='top', transform=ax.transAxes)

# Save as a 300 dpi PNG, then display
f.savefig("temp_hclust.png", dpi=300)
plt.show()

# Median thermostat temperature of each hierarchical cluster label; drawn
# further down as one horizontal reference line per cluster.
avg_temps = merge.groupby('cluster_hierarchical')['ThermostatTemp'].median()

f, ax = plt.subplots(figsize=(5, 5))

# Cluster display names (not read again by the statements in this cell)
hue_order = ['Comfortable', 'Uncomfortable']

sns.boxplot(
    data=merge,
    x="cluster_hierarchical",
    y="ThermostatTemp",
    hue="time_of_the_day",
    fill=False,
    gap=0.1,
    fliersize=3,
    ax=ax,
)

# Axis titles, fixed temperature range and readable cluster tick names
ax.set_xlabel("Cluster")
ax.set_ylabel("Indoor temperature (ºC, thermostat)")
ax.set_ylim(18, 32)
ax.set_xticks([0, 1])
ax.set_xticklabels(['Comfortable', 'Uncomfortable'])

# One dashed red line per cluster at its median temperature.
# NOTE(review): the line labels say "Avg" although the value is a median, and
# no legend call follows, so they presumably never appear in the legend.
cluster_names = ["Comfortable", "Uncomfortable"]
for idx, temp in avg_temps.items():
    reference_label = f'Overall Avg {cluster_names[idx]} ({temp:.2f}ºC)'
    ax.axhline(temp, color='red', linestyle='--', label=reference_label)

# Method name just above the top-right corner of the axes
ax.text(1, 1.05, "Hierarchical Clustering", fontsize=12, ha='right', va='top', transform=ax.transAxes)

# Save as a 300 dpi PNG, then display
f.savefig("temp_hierarchical_time_day.png", dpi=300)
plt.show()

# Number of non-missing thermostat readings per hierarchical cluster and
# time slot.
merge.groupby(["cluster_hierarchical", "time_of_the_day"])["ThermostatTemp"].count()

# Median thermostat temperature per hierarchical cluster and time slot.
# ThermostatTemp is selected before aggregating, so the median is computed
# for that column only and does not depend on every other column of the
# frame being numeric.
result = merge.groupby(["cluster_hierarchical", "time_of_the_day"])["ThermostatTemp"].median()
print(result)

# Number of non-missing thermostat readings per period and hierarchical cluster.
merge.groupby(["hw_True", "cluster_hierarchical"])["ThermostatTemp"].count()

# Median thermostat temperature per hierarchical cluster, all slots pooled.
result = merge.groupby(["cluster_hierarchical"])["ThermostatTemp"].median()
print(result)

# Copy of the frame without the "time_of_the_day" and "Timestamp" columns.
# The tables above no longer need it; the name is still defined in case
# later cells reference it.
merge_without_timestamp = merge.drop(columns=["time_of_the_day", "Timestamp"])

# Survey rows split by time slot ...
cte_7_15h_True_data = merge[merge["time_of_the_day"] == '7:00 - 15:00 h']
cte_15_23h_True_data = merge[merge["time_of_the_day"] == '15:00 - 23:00 h']
cte_23_7h_True_data = merge[merge["time_of_the_day"] == '23:00 - 7:00 h']

# ... and, within each slot and overall, by hierarchical cluster label:
# label 1 -> *_cluster_True, label 0 -> *_cluster_False.
cte_7_15h_True_cluster_True = cte_7_15h_True_data[cte_7_15h_True_data["cluster_hierarchical"] == 1]
cte_15_23h_True_cluster_True = cte_15_23h_True_data[cte_15_23h_True_data["cluster_hierarchical"] == 1]
cte_23_7h_True_cluster_True = cte_23_7h_True_data[cte_23_7h_True_data["cluster_hierarchical"] == 1]
cluster_True = merge[merge["cluster_hierarchical"] == 1]

cte_7_15h_True_cluster_False = cte_7_15h_True_data[cte_7_15h_True_data["cluster_hierarchical"] == 0]
cte_15_23h_True_cluster_False = cte_15_23h_True_data[cte_15_23h_True_data["cluster_hierarchical"] == 0]
cte_23_7h_True_cluster_False = cte_23_7h_True_data[cte_23_7h_True_data["cluster_hierarchical"] == 0]
cluster_False = merge[merge["cluster_hierarchical"] == 0]

from scipy.stats import mannwhitneyu

# Mann-Whitney U test on the thermostat temperature of the two clusters,
# restricted to the '23:00 - 7:00 h' slot. Missing readings are removed
# first: a NaN in either sample could otherwise yield a NaN p-value, and
# `NaN < alpha` is False, which would print the "no difference" message below.
night_temps_cluster_1 = cte_23_7h_True_cluster_True["ThermostatTemp"].dropna()
night_temps_cluster_0 = cte_23_7h_True_cluster_False["ThermostatTemp"].dropna()
statistic, p_value = mannwhitneyu(night_temps_cluster_1, night_temps_cluster_0)

# Display the test results
print("Mann-Whitney U Test Results:")
print(f"Statistic: {statistic}")
print(f"P-value: {p_value}")

# Compare the p-value with the 5 % significance level
alpha = 0.05
if p_value < alpha:
    print("The difference between the groups is statistically significant.")
else:
    print("There is no statistically significant difference between the groups.")

"""### **Survey Time of Response**

#### **K-means**
"""

# Split the survey rows by K-means cluster label: 0 -> comf, 1 -> uncomf.
labelled_0 = merge["cluster_kmeans"] == 0
labelled_1 = merge["cluster_kmeans"] == 1
comf = merge[labelled_0]
uncomf = merge[labelled_1]

# Median response hour of each group
comf["Hour"].median()

uncomf["Hour"].median()

# Mann-Whitney U test on the response hour of the two groups
statistic, p_value = mannwhitneyu(comf["Hour"], uncomf["Hour"])

# Report the statistic and the p-value
print("Mann-Whitney U Test Results:")
print(f"Statistic: {statistic}")
print(f"P-value: {p_value}")

# Compare the p-value with the 5 % significance level
alpha = 0.05
verdict = (
    "The difference between the groups is statistically significant."
    if p_value < alpha
    else "There is no statistically significant difference between the groups."
)
print(verdict)

# Distribution of the survey response hour (in percent, with a KDE curve)
# for each K-means cluster label.
f, ax = plt.subplots(figsize=(5, 5))

# Display name of each cluster label
label_mapping = {0: 'Comfortable', 1: 'Uncomfortable'}

sns.histplot(merge, x="Hour", legend=True, ax=ax, stat="percent", hue="cluster_kmeans", kde=True, element="step")
plt.xticks([0, 6, 12, 18, 24])  # x-axis ticks every six hours, 0 to 24

# Build the legend once from explicit handles and labels. The original
# created a legend with ax.legend() and titled it first, but that legend was
# replaced by this call, so the intermediate step is dropped.
# NOTE(review): the markers use the 'C0' / 'C1' colour-cycle entries, which
# presumably match the colours seaborn assigns to labels 0 and 1 - confirm.
handles = [
    plt.Line2D([0], [0], marker='o', color='w', markersize=10, markerfacecolor='C{}'.format(i))
    for i in range(2)
]
labels = [label_mapping[i] for i in range(2)]
legend = ax.legend(handles=handles, labels=labels, title="K-means")

plt.savefig("hour_kmeans.png", dpi=300)  # Save the plot as PNG with 300dpi
plt.show()

"""#### **Hierarchical Clustering**


"""

# Split the survey rows by hierarchical cluster label: 0 -> comf, 1 -> uncomf.
labelled_0 = merge["cluster_hierarchical"] == 0
labelled_1 = merge["cluster_hierarchical"] == 1
comf = merge[labelled_0]
uncomf = merge[labelled_1]

# Median response hour of each group
comf["Hour"].median()

uncomf["Hour"].median()

# Mann-Whitney U test on the response hour of the two groups
statistic, p_value = mannwhitneyu(comf["Hour"], uncomf["Hour"])

# Report the statistic and the p-value
print("Mann-Whitney U Test Results:")
print(f"Statistic: {statistic}")
print(f"P-value: {p_value}")

# Compare the p-value with the 5 % significance level
alpha = 0.05
verdict = (
    "The difference between the groups is statistically significant."
    if p_value < alpha
    else "There is no statistically significant difference between the groups."
)
print(verdict)

# Distribution of the survey response hour (in percent, with a KDE curve)
# for each hierarchical cluster label.
f, ax = plt.subplots(figsize=(5, 5))

# Display name of each cluster label
label_mapping = {0: 'Comfortable', 1: 'Uncomfortable'}

sns.histplot(merge, x="Hour", legend=True, ax=ax, stat="percent", hue="cluster_hierarchical", kde=True, element="step")
plt.xticks([0, 6, 12, 18, 24])  # x-axis ticks every six hours, 0 to 24

# Build the legend once from explicit handles and labels. The original
# created a legend with ax.legend() and titled it first, but that legend was
# replaced by this call, so the intermediate step is dropped.
# NOTE(review): the markers use the 'C0' / 'C1' colour-cycle entries, which
# presumably match the colours seaborn assigns to labels 0 and 1 - confirm.
handles = [
    plt.Line2D([0], [0], marker='o', color='w', markersize=10, markerfacecolor='C{}'.format(i))
    for i in range(2)
]
labels = [label_mapping[i] for i in range(2)]
legend = ax.legend(handles=handles, labels=labels, title="Hierarchical cluster")

plt.savefig("hour_hclust.png", dpi=300)  # Save the plot as PNG with 300dpi
plt.show()

"""## **Research sub-question 3 [conducted in RStudio]**

Use the CSV files exported below to run the analysis in R. The R code is available on [GitHub](https://github.com/juan-gamero-salinas/climateready-survey-pamplona).

### **K Means**
"""

# Columns written to the CSV file read by the R script, in output order.
# The last column is the K-means cluster label.
export_columns = [
    'meanTout', 'Gender', 'Age', 'NatVent_night', 'UsesCoolingAlternatives',
    'hw_True', 'cte_23_7h_True', 'cte_7_15h_True', 'cte_15_23h_True',
    'ShadingDevices_No', 'ShadingDevices_WhenDirectSun',
    'ShadingDevices_AllAfternoon', 'ShadingDevices_AllMorning',
    'ShadingDevices_AllDay',
    'NatVent_day_ToutCool', 'NatVent_day_Anytime', 'NatVent_day_No',
    'hasAC_No', 'hasAC_DormOrLiving', 'hasAC_DormAndLiving', 'hasAC_AllRooms',
    'HouseholdSize',
]
# The sixteen columns 'is_31001' ... 'is_31016'
export_columns += [f'is_{code}' for code in range(31001, 31017)]
export_columns += [
    'isNot_pamplona',
    'before_1980', 'between_1980_2006', 'after_2007',
    "dwelling_OldTown", 'dwelling_Block', 'dwelling_Tower',
    'dwelling_Detached', 'dwelling_Other',
    'Rehab_No', 'Rehab_Yes',
    'Income_below1500', 'Income_between_1500_3500', 'Income_above_3500',
    'Occ_NormallyAtHome', 'Occ_NotAlwaysAtHome',
    'SrfcArea_below90',
    'Storey_UpperFloor', 'Storey_NotApartment',
    'numOrient_1', 'numOrient_above2',
    'LivRoom_SrfcAreaWindow_below2', 'LivRoom_SrfcAreaWindow_above2',
    'Bedroom_SrfcAreaWindow_below2', 'Bedroom_SrfcAreaWindow_above2',
    'AC_Installed_Yes', 'WouldYouInstallAC_Yes',
    'hasCoolRoom', 'hasCoolingAlternatives',
    'cluster_kmeans',
]

para_exportar = merge[export_columns]

# Renumber the rows from 0 and write a semicolon-separated file without the
# index column.
R_logistic = para_exportar.reset_index(drop=True)
R_logistic.to_csv('R_logistic_kmeans.csv', sep=';', index=False)

"""### **HClust**"""

# Columns written to the CSV file read by the R script, in output order.
# The last column is the hierarchical cluster label.
export_columns = [
    'meanTout', 'Gender', 'Age', 'NatVent_night', 'UsesCoolingAlternatives',
    'hw_True', 'cte_23_7h_True', 'cte_7_15h_True', 'cte_15_23h_True',
    'ShadingDevices_No', 'ShadingDevices_WhenDirectSun',
    'ShadingDevices_AllAfternoon', 'ShadingDevices_AllMorning',
    'ShadingDevices_AllDay',
    'NatVent_day_ToutCool', 'NatVent_day_Anytime', 'NatVent_day_No',
    'hasAC_No', 'hasAC_DormOrLiving', 'hasAC_DormAndLiving', 'hasAC_AllRooms',
    'HouseholdSize',
]
# The sixteen columns 'is_31001' ... 'is_31016'
export_columns += [f'is_{code}' for code in range(31001, 31017)]
export_columns += [
    'isNot_pamplona',
    'before_1980', 'between_1980_2006', 'after_2007',
    "dwelling_OldTown", 'dwelling_Block', 'dwelling_Tower',
    'dwelling_Detached', 'dwelling_Other',
    'Rehab_No', 'Rehab_Yes',
    'Income_below1500', 'Income_between_1500_3500', 'Income_above_3500',
    'Occ_NormallyAtHome', 'Occ_NotAlwaysAtHome',
    'SrfcArea_below90',
    'Storey_UpperFloor', 'Storey_NotApartment',
    'numOrient_1', 'numOrient_above2',
    'LivRoom_SrfcAreaWindow_below2', 'LivRoom_SrfcAreaWindow_above2',
    'Bedroom_SrfcAreaWindow_below2', 'Bedroom_SrfcAreaWindow_above2',
    'AC_Installed_Yes', 'WouldYouInstallAC_Yes',
    'hasCoolRoom', 'hasCoolingAlternatives',
    "cluster_hierarchical",
]

para_exportar = merge[export_columns]

# Renumber the rows from 0 and write a semicolon-separated file without the
# index column.
R_logistic = para_exportar.reset_index(drop=True)
R_logistic.to_csv('R_logistic_hclust.csv', sep=';', index=False)

"""## **Research sub-question 4**"""

from scipy.stats import fisher_exact
import pandas as pd

"""### **Sleep & Heat symptoms**"""

# Boolean masks of the rows that report heat symptoms / sleeping problems.
heat = merge["HeatSymptoms"] == True
sleep = merge["SleepingProblems"] == True

# True only for rows that report both. The mask is assigned directly, so the
# new column holds booleans from the start instead of being created as a
# column of None and filled in with two separate .loc writes.
merge["HeatSymptoms_SleepingProblems"] = heat & sleep

# Number of rows per value of each flag
merge.groupby("HeatSymptoms").size()

merge.groupby("SleepingProblems").size()

# Fisher's exact test of HeatSymptoms against each cluster assignment:
# K-means first, then hierarchical clustering.
for cluster_col in ('cluster_kmeans', 'cluster_hierarchical'):
    # Cluster label in the rows, HeatSymptoms value in the columns
    contingency_table = pd.crosstab(merge[cluster_col], merge['HeatSymptoms'])

    odds_ratio, p_value = fisher_exact(contingency_table)

    # Report the table, the odds ratio and the p-value
    print("Contingency Table:")
    print(contingency_table)
    print("\nOdds Ratio:", odds_ratio)
    print("P-value:", p_value)

# Number of survey rows per K-means cluster, split by the SleepingProblems flag.
f, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=merge, x='cluster_kmeans', hue='SleepingProblems', fill=False, ax=ax)

# Fixed count range on the y axis
ax.set_ylim(0, 100)

# Axis titles and tick labels with consistent font sizes
ax.set_xlabel("Cluster [K-means]", fontsize=12)
ax.set_ylabel("Number of observations", fontsize=12)
ax.tick_params(axis='both', labelsize=11)

# Readable names for the two cluster ticks
ax.set_xticks([0, 1])
ax.set_xticklabels(['Comfortable', 'Uncomfortable'])

# Write each bar's count just above it; patches without height are skipped.
for bar in ax.patches:
    bar_count = int(bar.get_height())
    if bar_count <= 0:
        continue
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.text(bar_center, bar_count + 1, f'{bar_count}', ha='center', fontsize=11)

# Tighten the layout, save as a 300 dpi PNG, then display
f.tight_layout()
f.savefig("ft_kmeans_sleep.png", dpi=300)
plt.show()

# Number of survey rows per hierarchical cluster, split by the
# SleepingProblems flag.
f, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=merge, x='cluster_hierarchical', hue='SleepingProblems', fill=False, ax=ax)

# Fixed count range on the y axis
ax.set_ylim(0, 100)

# Axis titles and tick labels with consistent font sizes
ax.set_xlabel("Cluster [Hierarchical clustering]", fontsize=12)
ax.set_ylabel("Number of observations", fontsize=12)
ax.tick_params(axis='both', labelsize=11)

# Readable names for the two cluster ticks
ax.set_xticks([0, 1])
ax.set_xticklabels(['Comfortable', 'Uncomfortable'])

# Write each bar's count just above it; patches without height are skipped.
for bar in ax.patches:
    bar_count = int(bar.get_height())
    if bar_count <= 0:
        continue
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.text(bar_center, bar_count + 1, f'{bar_count}', ha='center', fontsize=11)

# Tighten the layout, save as a 300 dpi PNG, then display
f.tight_layout()
f.savefig("ft_hclust_sleep.png", dpi=300)
plt.show()

# Fisher's exact test of SleepingProblems against each cluster assignment:
# K-means first, then hierarchical clustering.
for cluster_col in ('cluster_kmeans', 'cluster_hierarchical'):
    # Cluster label in the rows, SleepingProblems value in the columns
    contingency_table = pd.crosstab(merge[cluster_col], merge['SleepingProblems'])

    odds_ratio, p_value = fisher_exact(contingency_table)

    # Report the table, the odds ratio and the p-value
    print("Contingency Table:")
    print(contingency_table)
    print("\nOdds Ratio:", odds_ratio)
    print("P-value:", p_value)

# Number of survey rows per K-means cluster, split by the HeatSymptoms flag.
f, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=merge, x='cluster_kmeans', hue='HeatSymptoms', fill=False, ax=ax)

# Fixed count range on the y axis
ax.set_ylim(0, 100)

# Axis titles and tick labels with consistent font sizes
ax.set_xlabel("Cluster [K-means]", fontsize=12)
ax.set_ylabel("Number of observations", fontsize=12)
ax.tick_params(axis='both', labelsize=11)

# Readable names for the two cluster ticks
ax.set_xticks([0, 1])
ax.set_xticklabels(['Comfortable', 'Uncomfortable'])

# Write each bar's count just above it; patches without height are skipped.
for bar in ax.patches:
    bar_count = int(bar.get_height())
    if bar_count <= 0:
        continue
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.text(bar_center, bar_count + 1, f'{bar_count}', ha='center', fontsize=11)

# Tighten the layout, save as a 300 dpi PNG, then display
f.tight_layout()
f.savefig("ft_kmeans_heat.png", dpi=300)
plt.show()

# Number of survey rows per hierarchical cluster, split by the HeatSymptoms flag.
f, ax = plt.subplots(figsize=(5, 5))
sns.countplot(data=merge, x='cluster_hierarchical', hue='HeatSymptoms', fill=False, ax=ax)

# Fixed count range on the y axis
ax.set_ylim(0, 100)

# Axis titles and tick labels with consistent font sizes
ax.set_xlabel("Cluster [Hierarchical clustering]", fontsize=12)
ax.set_ylabel("Number of observations", fontsize=12)
ax.tick_params(axis='both', labelsize=11)

# Readable names for the two cluster ticks
ax.set_xticks([0, 1])
ax.set_xticklabels(['Comfortable', 'Uncomfortable'])

# Write each bar's count just above it; patches without height are skipped.
for bar in ax.patches:
    bar_count = int(bar.get_height())
    if bar_count <= 0:
        continue
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.text(bar_center, bar_count + 1, f'{bar_count}', ha='center', fontsize=11)

# Tighten the layout, save as a 300 dpi PNG, then display
f.tight_layout()
f.savefig("ft_hclust_heat.png", dpi=300)
plt.show()