-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreating-graphs.py
105 lines (78 loc) · 4.11 KB
/
creating-graphs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
## Loading Data ##
## We're using pandas to receive and manipulate data!
import pandas as pd
## graphs library
import matplotlib.pyplot as plt # type: ignore
## Path to the CSV file with the 2023 travel records.
path_data = "../data-analysis/pastas/2023_Viagem.csv"
## `encoding` tells pandas how the file's bytes are encoded, and `sep` is the
## column separator (needed here because the file uses ";" instead of ",").
## NOTE(review): if the "Valor ..." columns use "," as the decimal mark they
## will be loaded as text — confirm whether `decimal=","` is needed here.
df_viagens = pd.read_csv(path_data, encoding="Windows-1252", sep=";")
## Show every column when a DataFrame is printed (None = no limit).
pd.set_option("display.max_columns", None)
## Display floats with only 2 decimal places.
pd.set_option("display.float_format", "{:.2f}".format)
## New column with the total expenses of each trip
## (daily allowances + tickets + other expenses).
df_viagens["Despesas"] = (
    df_viagens["Valor diárias"]
    + df_viagens["Valor passagens"]
    + df_viagens["Valor outros gastos"]
)
## Replace missing "Cargo" values with an explicit placeholder label.
df_viagens["Cargo"] = df_viagens["Cargo"].fillna("NÃO IDENTIFICADO")
## Convert the start/end dates from "dd/mm/yyyy" strings to datetime.
df_viagens["Período - Data de início"] = pd.to_datetime(
    df_viagens["Período - Data de início"], format="%d/%m/%Y"
)
df_viagens["Período - Data de fim"] = pd.to_datetime(
    df_viagens["Período - Data de fim"], format="%d/%m/%Y"
)
## Name of the month in which the trip started.
## (`month_name` lives on the `.dt` accessor: `.dt.month_name()`.)
df_viagens["Mês da viagem"] = df_viagens["Período - Data de início"].dt.month_name()
## Trip length in whole days. The subtraction must be parenthesised so that
## `.dt.days` is applied to the resulting timedelta, not to the start date.
df_viagens["Dias da viagem"] = (
    df_viagens["Período - Data de fim"] - df_viagens["Período - Data de início"]
).dt.days
## Build a summary table with one row per "Cargo", using named aggregations:
## each keyword is the output column, given as (source column, method).
df_travels_consolidates = (
    df_viagens
    .groupby("Cargo")  ## Group the rows by "Cargo"
    .agg(
        mean_expenses=("Despesas", "mean"),  ## Average of "Despesas"
        ## The source column must match the name created earlier: "Dias da viagem".
        mean_days=("Dias da viagem", "mean"),
        total_expenses=("Despesas", "sum"),
        ## Most frequent destination(s); `mode` can yield several values on ties.
        frequent_destination=("Destinos", pd.Series.mode),
        n_travels=("Nome", "count"),  ## Number of non-null "Nome" entries
    )
    .reset_index()  ## Turn the "Cargo" group key back into a regular column
)
## Share of trips per "Cargo". The column names are set explicitly so the
## result has "Cargo" and "proportion" columns regardless of pandas version.
df_cargos = (
    df_viagens["Cargo"]
    .value_counts(normalize=True)
    .rename("proportion")
    .rename_axis("Cargo")
    .reset_index()
)
## Keep only the "Cargo" values that account for more than 1% of the trips.
relevant_cargos = df_cargos.loc[df_cargos["proportion"] > 0.01, "Cargo"]
## Boolean mask over the summary table: True where "Cargo" is a relevant one.
## (Named so that it does not shadow the builtin `filter`.)
relevant_cargo_mask = df_travels_consolidates["Cargo"].isin(relevant_cargos)
## Final table: consolidated, filtered and ordered by number of trips (descending).
df_final = df_travels_consolidates[relevant_cargo_mask].sort_values(
    by="n_travels", ascending=False
)
## Bar chart: number of trips per "Cargo".
fig, ax = plt.subplots(figsize=(16, 6))
## Horizontal bars, one per "Cargo", sized by the number of trips.
ax.barh(df_final["Cargo"], df_final["n_travels"], color="#536674")
## Invert the y axis so the first row of df_final is drawn at the top.
ax.invert_yaxis()
## White background for the plotting area.
ax.set_facecolor("#fff")
## Figure title (fixed typo: "carbo" -> "cargo").
fig.suptitle("Viagens por cargo público (2023)")
## Small source note, positioned in figure coordinates.
fig.text(0.65, 0.89, "Fonte: Portal da Transparência", fontsize=8)
## Light dashed grid to make the bar lengths easier to read.
ax.grid(color="gray", linestyle="--", linewidth=0.5)
## Smaller font for the y tick labels (the "Cargo" names).
ax.tick_params(axis="y", labelsize=8)
## Label for the x axis.
ax.set_xlabel("Número de viagens")
## Display the chart.
plt.show()
## Scatter chart: trip length (days) versus total expenses.
fig, ax = plt.subplots(figsize=(16, 6))
## One point per trip; alpha makes overlapping points visible.
ax.scatter(df_viagens["Dias da viagem"], df_viagens["Despesas"], alpha=0.4)
## Optional axis limits to zoom in on the bulk of the data:
# ax.set_xlim(0,100)
# ax.set_ylim(0,25000)
## Boolean mask of the trips whose expenses are greater than 175000.
## (It must stay a row-aligned boolean Series to be usable as a filter;
## calling `.value_counts()` on it would collapse it into a count table.)
filter_175 = df_viagens["Despesas"] > 175000
## Rows of the trips above that threshold, kept in a variable for inspection.
df_viagens_175 = df_viagens[filter_175]
## Path where the figure will be saved.
path_figure = "../data-analysis/pastas/figure.png"
## Save this figure explicitly, trimming the surrounding whitespace.
fig.savefig(path_figure, bbox_inches="tight")