# Constants: input/output paths used by the three visualizations in this file.
# NOTE(review): "CHLOROPETH" is a misspelling of "choropleth"; the names are
# kept unchanged because they are referenced further down in this file.

# CSV loaded with pandas for the price histogram.
HISTOGRAM_DATA = './data/houses_madrid.csv'
# Shapefile loaded with geopandas; provides the country boundaries for the map.
CHLOROPETH_DATA_MAP = './data/ne_110m_admin_0_countries/ne_110m_admin_0_countries.shp'
# CSV loaded with pandas; provides the density values merged onto the boundaries.
CHLOROPETH_DATA_DENSITY = './data/densityPopulation.csv'
# Output path where the rendered Folium map is saved as HTML.
CHLOROPETH_DATA_SAVED_MAP = './assets/chloropeth_map.html'
# Excel workbook loaded with pandas (sheet '141') for the Marimekko chart.
MARIMEKKO_DATA = './data/01 Presupuestos Generales del Estado Consolidados.xlsx'

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


# Bin width, in the same units as the 'price' column, used for the histogram
INTERVAL = 50000

# Read the housing dataset
df_histogram = pd.read_csv(HISTOGRAM_DATA, sep=',')

# Coerce 'price' to numbers (unparseable entries become NaN), then drop the
# rows that ended up without a price
df_histogram['price'] = pd.to_numeric(df_histogram['price'], errors='coerce')
df_cleaned = df_histogram.dropna(subset=['price'])

# First and third quartiles of the price, and the interquartile range
Q1, Q3 = (df_cleaned['price'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Keep only the rows whose price lies within 1.5 * IQR of the quartiles
# (both bounds inclusive)
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
within_bounds = df_cleaned['price'].between(lower_bound, upper_bound)
df_histogram_no_outliers = df_cleaned[within_bounds]

# Bin edges start at 0 and advance in steps of INTERVAL until the largest
# remaining price is covered
max_price = df_histogram_no_outliers['price'].max()
bins = np.arange(0, max_price + INTERVAL, INTERVAL)

# Draw the histogram on an explicit Axes using the bin edges computed above
fig, ax = plt.subplots(figsize=(10, 6))
df_histogram_no_outliers['price'].plot.hist(
    ax=ax,
    bins=bins,
    color='skyblue',
    edgecolor='black',
)
ax.set_title('Price Distribution without Outliers')
ax.set_xlabel('Price (€)')
ax.set_ylabel('Frequency')
ax.grid(axis='x', alpha=0.7)
plt.show()

import geopandas as gpd
import pandas as pd
import folium

# Load the world boundaries (shapefile) and the population density table (CSV)
shapefile_path = CHLOROPETH_DATA_MAP
world_boundaries = gpd.read_file(shapefile_path)
csv_path = CHLOROPETH_DATA_DENSITY
df_density = pd.read_csv(csv_path)

# Give both tables a common 'country' column so they can be merged on it
world_boundaries = world_boundaries.rename(columns={'NAME': 'country'})
df_density = df_density.rename(columns={'name': 'country'})

# Keep only the rows whose CONTINENT is Europe
europe = world_boundaries[world_boundaries['CONTINENT'] == 'Europe']

# Left merge: every European boundary is kept, even when the density table
# has no row with a matching country name (its density is then missing)
merged_europe = europe.merge(df_density, on='country', how='left')

# Initialize a Folium map centered on Europe.
# The variable is deliberately not called `map`, which would shadow the
# Python builtin of the same name for the rest of the session.
europe_map = folium.Map(location=[54, 15], zoom_start=4)

# Add the choropleth layer for the European countries
folium.Choropleth(
    geo_data=merged_europe,               # Geometries (merged GeoDataFrame)
    data=merged_europe,                   # Table holding the density values
    columns=['country', 'Density'],       # Key column and value column
    key_on='feature.properties.country',  # GeoJSON property matched against the key column
    fill_color='YlOrRd',                  # Yellow-orange-red palette
    fill_opacity=1,
    line_opacity=1,
    legend_name='Density Population'
).add_to(europe_map)

# Save the map as HTML; the trailing expression displays it when this code is
# the last statement of a Jupyter Notebook cell
europe_map.save(CHLOROPETH_DATA_SAVED_MAP)
europe_map

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the budget table from sheet '141': skip the first 5 rows of the sheet
# and read at most 28 data rows
file_path = MARIMEKKO_DATA
budgets_df = pd.read_excel(file_path, sheet_name='141', skiprows=5, nrows=28)
budgets_df = budgets_df.rename(columns={'Políticas': 'policy'})
# Column labels are cast to str so the years can be selected as '2021', etc.
budgets_df.columns = budgets_df.columns.astype(str)
budgets_df.head(5)  # Preview (only rendered when last in a notebook cell)

# Selected policies and years
selected_policies = ['Sanidad', 'Defensa', 'Educación', 'Cultura']
selected_years = ['2021', '2022', '2023']

# Keep only the selected policies (rows) and years (columns)
selected_budgets_df = budgets_df[budgets_df['policy'].isin(selected_policies)]
selected_budgets_df = selected_budgets_df[['policy'] + selected_years]

# Use the policy as index and cast every year column to numeric.
# DataFrame.apply returns a new frame, so the result must be assigned back;
# otherwise the conversion is silently discarded.
selected_budgets_df = selected_budgets_df.set_index('policy')
selected_budgets_df = selected_budgets_df.apply(pd.to_numeric)
selected_budgets_df

# This code is based on the one found at:
# https://curbal.com/curbal-learning-portal/90-of-100-marimekko-chart-in-matplotlib

# Total per year (column-wise sum over the selected policies)
totals_per_year = selected_budgets_df.sum(axis=0)

# Share of each policy within its year, in percent -> segment heights
percentages = selected_budgets_df / totals_per_year * 100

# Share of each year in the sum of all years, in percent -> column widths
widths = totals_per_year / totals_per_year.sum() * 100

# Set colors
policy_colors = {
    'Defensa': 'orange',
    'Sanidad': 'green',
    'Educación': 'yellow',
    'Cultura': 'purple'
}

# Marimekko chart
fig, ax = plt.subplots(figsize=(10, 6))
x_start = 0.0  # Left edge of the column currently being drawn

# One column per year, made of one stacked segment per policy
for i, year in enumerate(selected_budgets_df.columns):
    # Label-based lookup: `widths` is indexed by the year labels, and integer
    # positional access through Series[...] is deprecated in pandas
    year_width = widths.loc[year]

    for j, policy in enumerate(selected_budgets_df.index):
        ax.bar(
            x_start,
            percentages.loc[policy, year],
            width=year_width,
            # Stack on top of the policies already drawn for this year
            bottom=percentages[year].iloc[:j].sum(),
            # Label only the first column so each policy appears once in the legend
            label=policy if i == 0 else "",
            color=policy_colors[policy],
            align='edge',  # x is the left edge of the bar, not its center
            edgecolor='black'
        )

    x_start += year_width

# Add labels and title
ax.set_xlabel('Year')
ax.set_ylabel('Percentage')
ax.set_title('Marimekko Chart - Budget Allocation by Policy (2021-2023)')

# Put one tick at the horizontal center of each column, labelled with the
# year and its share of the overall total
xticks = widths.cumsum() - widths / 2
xtick_labels = [
    f'{year} -> {widths.loc[year]:.2f}%'
    for year in selected_budgets_df.columns
]

ax.set_xticks(xticks)
ax.set_xticklabels(xtick_labels)

# Build the legend from the labels attached to the first column's bars
ax.legend(title='Policy')

# Show the chart
plt.tight_layout()
plt.show()

/var/folders/_1/_rnshwnd0px1m82vsfm_3cg80000gn/T/ipykernel_37746/1885233174.py:32: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  width=widths[i],
/var/folders/_1/_rnshwnd0px1m82vsfm_3cg80000gn/T/ipykernel_37746/1885233174.py:40: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`
  x_start += widths[i]

	policy	2014	2015	2016	2017	2018	2018-P	2019-P	2021	2022	2023
0	Justicia	1500.75404	1508.15445	1604.31179	1726.19093	1780.74441	1779.72937	1779.72937	2048.13086	2283.54940	2291.31848
1	Defensa	5654.45439	5711.68742	5734.29191	7575.59287	8400.56154	8400.56154	8400.56154	9072.01433	9790.81151	12316.82907
2	Seguridad ciudadana e Instituciones penitencia...	7880.95326	7843.12934	7903.61760	7912.33049	8418.13871	8418.08732	8418.08732	9694.41496	10148.79292	10719.20460
3	Política Exterior y de cooperación para el des...	1395.16909	1422.42613	1477.93985	1521.69850	1581.44350	1581.44350	1581.44350	1882.00770	2253.51982	2425.50604
4	Pensiones	127483.83335	131658.53137	135448.92579	139646.72308	144834.30566	144834.30566	144834.30566	163296.58073	171139.65331	190687.24615

	2021	2022	2023
policy
Defensa	9072.01433	9790.81151	12316.82907
Sanidad	7329.68378	6606.04994	7049.08276
Educación	4893.45613	5022.78388	5354.97335
Cultura	1148.06383	1589.33219	1803.68982

Preparación

1. Histograma

Historia y Origen

Pros y Contras

Tipos de Datos que Admite

Aplicaciones Típicas

2. Mapa Coroplético

Origen y Autoría

Pros y Contras

Tipos de Datos que Admite

Aplicaciones Típicas

3. Gráfico Marimekko

Historia y Origen

Pros y Contras

Tipos de Datos que Admite

Aplicaciones Típicas