-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathali_2_HealthGPS_GlobalParamGenerator.R
379 lines (293 loc) · 13.2 KB
/
ali_2_HealthGPS_GlobalParamGenerator.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
################################################################################
######################## Libraries and Packages Initialisation##################
## Package setup
# Package install
package_list <- c("stargazer", "ggplot2", "dplyr", "nnet", "zoo", "moments",
"MASS", "tidyr", "corrplot")
new_packages <- package_list[!(package_list %in%
installed.packages()[, "Package"])]
if(length(new_packages)) install.packages(new_packages)
# Package load
lapply(package_list, require, character.only = TRUE)
################################################################################
######################## Parameters ############################################
#File to use
file_name <- "data/maxime-180324/Health_GPS_sc_effect_main.csv"
# (e.g., 0.5 for half the rows)
country = "India"
rate_to_print <- 0.2
#Filter threshold
rate_to_filter <- 0.01
#Smoothing parameters
smoothing = TRUE
repeated_smoothing = 150
################################################################################
######################## Data Loading and Variable Extraction ##################
# Load the CSV data into a data frame
data <- read.csv(file_name)
# Attach the data frame for easier variable referencing
attach(data)
# Extracting variables from the 'data' data frame
sex <- data$ind_gender
age <- data$ind_age
age1 <- data$ind_age
age2 <- data$ind_age * data$ind_age
age3 <- data$ind_age * data$ind_age * data$ind_age
inc <- data$hh_tercile
protein <- data$ind_protein_g
carb <- data$ind_carb_g
fat <- data$ind_tot_fat_g
sodium <- data$ind_sodium_mg
sector <- data$hh_sector
energy_original <- data$ind_energy_kcal
# Calculate 'energy' based on the standard formula
energy <- 4 * carb + 9 * fat + 4 * protein + 0 * sodium
################################################################################
########################Filtering###############################################
# Create 'subdata' dataframe
subdata <- data.frame(sex, age, age1, age2, age3, inc, sector, carb, fat, protein, sodium, energy, energy_original)
# Remove rows with non-numeric NaN or empty values
subdata <- subdata[complete.cases(subdata), ]
# Set upper and lower quantiles
lower_q <- rate_to_filter
upper_q <-1-lower_q
# Filter 'subdata' based on conditions
df <- subdata %>%
filter(
age <= 100, # Age less than 100
carb > quantile(carb, lower_q) & carb < quantile(carb, upper_q), # Carb within quantiles
fat > quantile(fat, lower_q) & fat < quantile(fat, upper_q), # Fat within quantiles
protein > quantile(protein, lower_q) & protein < quantile(protein, upper_q), # Protein within quantiles
sodium > quantile(sodium, lower_q) & sodium < quantile(sodium, upper_q), # Sodium within quantiles
energy > quantile(energy, lower_q) & energy < quantile(energy, upper_q) # Energy within quantiles
)
################################################################################
########################ToolBox#################################################
get_smooth_factor <- function(initial_factor, times) {
if(times<1)
return(initial_factor)
res <- initial_factor
for (j in 1:times) {
tmp <- res
for (p in 1:length(res)) {
tmp[p] <- res[p]
}
sum <- 0
for (i in 1:length(res)) {
if (i == 1) {
res[i] <- (2 * tmp[i] + tmp[i + 1]) / 3
} else if (i == length(tmp)) {
res[i] <- (tmp[i - 1] + 2 * tmp[i]) / 3
} else {
res[i] <- (tmp[i - 1] + tmp[i] + tmp[i + 1]) / 3
}
sum <- sum + res[i]
}
}
return(res)
}
################################################################################
########################Calculate Means and Create CSV Files####################
# Calculate mean values for 'carb', 'fat', 'protein', 'sodium', and 'energy'
# based on 'age' and 'sex'
result <- df %>%
group_by(age, sex) %>%
summarize(
carb_mean = mean(carb),
fat_mean = mean(fat),
protein_mean = mean(protein),
sodium_mean = mean(sodium),
energy_mean = mean(energy)
)
# Merge the calculated mean values back to the original dataframe ('df')
result$sex = ifelse(result$sex=="1","Male","Female")
# Reorder the rows based on the 'sex' column
result <- result[order(result$sex), ]
# Remove the "_mean" suffix from column names
colnames(result) <- sub("_mean$", "", colnames(result))
# Change the "energy" to "energyintake"
colnames(result) <- sub("energy", "energyintake", colnames(result))
# Split the data frame based on 'sex'
male_data <- result[result$sex == "Male", ]
female_data <- result[result$sex == "Female", ]
# List of columns to smooth
columns_to_smooth <- c("carb", "fat", "protein", "sodium","energyintake")
# Apply the smoothing function to selected columns
male_data[, columns_to_smooth] <- lapply(male_data[, columns_to_smooth], get_smooth_factor, repeated_smoothing)
female_data[, columns_to_smooth] <- lapply(female_data[, columns_to_smooth], get_smooth_factor, repeated_smoothing)
# Variables to plot
variables_to_plot <- c("carb", "fat", "protein", "sodium", "energyintake")
# Loop through each variable and create a separate plot
for (variable in variables_to_plot) {
# Plotting male_data
plot(male_data$age, male_data[[variable]],
type="l",
lwd=3,
col="blue",
main=paste("Average ", variable, " by sex and age"),
xlab="Age",
ylab=variable,
ylim = c(1000,2500))
# Plotting female_data
lines(female_data$age, female_data[[variable]], col="red", lwd=3)
# Adding legend
legend("topright", legend=c("Male", "Female"), col=c("blue", "red"), lwd=3)
}
# Write CSV files for Male and Female
write.csv(male_data, "out/Male.FactorMeans.csv", row.names = FALSE)
write.csv(female_data, "out/Female.FactorMeans.csv", row.names = FALSE)
################################################################################
######################################Rural Sector: Prevalence##################
# Subset the dataframe into two age groups (under 18 and above 18)
df_under_18 <- df[df$age < 18, ]
df_above_18 <- df[df$age >= 18, ]
# Calculate the sector prevalence for males and females in each age group
prevalence_under_18 <- aggregate(sector ~ sex, data = df_under_18, FUN = function(x) sum(x == 1) / length(x))
prevalence_above_18 <- aggregate(sector ~ sex, data = df_above_18, FUN = function(x) sum(x == 1) / length(x))
# Create bar plots for each age group and gender
par(mfrow = c(2, 1)) # Create a 2x1 grid for plots
# Bar plot for individuals under 18
barplot(prevalence_under_18$sector,
beside = TRUE,
main = "Rural Prevalence for Individuals Under 18",
xlab = "Gender",
ylab = "Prevalence",
col = c("pink", "blue"),
names.arg = c("Female", "Male"))
# Bar plot for individuals above 18
barplot(prevalence_above_18$sector,
beside = TRUE,
main = "Rural Prevalence for Individuals Above 18",
xlab = "Gender",
ylab = "Prevalence",
col = c("pink", "blue"),
names.arg = c("Female", "Male"))
# Reset the plot layout
par(mfrow = c(1, 1))
# Combine the results into a single data frame
prevalence_under_18$age = c("Under 18", "Under 18")
prevalence_above_18$age = c("Above 18", "Above 18")
result <- rbind(prevalence_under_18, prevalence_above_18)
result$sex = ifelse(result$sex=="1","Male","Female")
# Reorder columns (sex, age, sector)
result <- result[, c("sex", "age", "sector")]
# Write the result to a CSV file
write.csv(result, "out/rural_prevalence.csv", row.names = FALSE)
################################################################################
####################################Income######################################
# Calculate the prevalence by sex and income group
prevalence_table <- table(df$sex, df$inc) / rowSums(table(df$sex, df$inc))
# Convert the table to a data frame
prevalence_df <- as.data.frame.matrix(prevalence_table)
# Create a bar plot
barplot(as.matrix(prevalence_df),
beside = TRUE,
main = "Prevalence of Income Groups by Sex",
xlab = "Sex (0 = Female, 1 = Male)",
ylab = "Prevalence",
col = c("pink", "blue"),
legend.text = TRUE,
names.arg = c("Tercile 1", "Tercile 2", "Tercile 3"))
# Subset the data for cases where sector is equal to 1
data_subset <- df[df$sector == 0, ]
# Calculate the prevalence by sex and income group within sector 1
prevalence_table <- table(data_subset$sex, data_subset$inc) / rowSums(table(data_subset$sex, data_subset$inc))
# Convert the table to a data frame
prevalence_df <- as.data.frame.matrix(prevalence_table)
# Create a bar plot
barplot(as.matrix(prevalence_df),
beside = TRUE,
main = "Prevalence of Income Groups by Sex (Urban)",
xlab = "Sex (0 = Female, 1 = Male)",
ylab = "Prevalence",
col = c("pink", "blue"),
legend.text = TRUE,
names.arg = c("Tercile 1", "Tercile 2", "Tercile 3"))
df$over18 <- ifelse(df$age < 18, 0, 1)
# Subset the data for cases where sector is equal to 1
data_subset <- df
# Calculate the prevalence by sex and income group within sector 1
prevalence_table <- table(data_subset$over18, data_subset$inc) / rowSums(table(data_subset$over18, data_subset$inc))
# Convert the table to a data frame
prevalence_df <- as.data.frame.matrix(prevalence_table)
# Create a bar plot
barplot(as.matrix(prevalence_df),
beside = TRUE,
main = "Prevalence of Income Groups by Age Groups",
xlab = "AgeGroup (0 = Under 18, 1 = Over 18)",
ylab = "Prevalence",
col = c("green", "red"),
legend.text = TRUE,
names.arg = c("Tercile 1", "Tercile 2", "Tercile 3"))
# Fit a multinomial logistic regression model
model <- multinom(inc ~ sex + over18 + sector, data = df)
summary(model)
# 26/03/24 – 17:09
# This section appears to throw an error
# Error in model$fitted :
# (converted from warning) partial match of 'fitted' to 'fitted.values'
model
coefficients <- summary(model)$coefficients
# Create a new row with all values as 0
new_row <- c(0, 0, 0, 0)
# Add the new row to your existing data frame
coefficients <- rbind(coefficients, new_row)
rownames(coefficients)[rownames(coefficients) == "new_row"] <- "1"
coefficients <- coefficients[order(rownames(coefficients)), ]
write.csv(coefficients,"income_model.csv")
#### Carrying on from here.
# Generate predicted probabilities for each class
predicted_probs <- predict(model, type = "probs")
# Sample from the predicted probabilities
n_samples <- 1 # Replace with the desired number of samples
samples <- apply(predicted_probs, 1, function(probs) sample(1:length(probs), size = n_samples, prob = probs))
df$samples = samples
data_subset <- df[df$sex == 0, ]
# Calculate the prevalence of each category for 'samples' and 'inc'
prevalence_samples <- table(data_subset$samples) / length(data_subset$samples)
prevalence_inc <- table(data_subset$inc) / length(data_subset$inc)
# Convert prevalence values to numeric
prevalence_samples <- as.numeric(prevalence_samples)
prevalence_inc <- as.numeric(prevalence_inc)
# Combine the prevalence data into a single dataframe
prevalence_data <- data.frame(Category = c("Tercile 1", "Tercile 2", "Tercile 3"),
Samples = prevalence_samples,
Income = prevalence_inc)
# Create a bar plot
barplot(
t(as.matrix(prevalence_data[, 2:3])),
beside = TRUE,
main = "Female Income: Prevalence Comparison of 'model' and 'data'",
xlab = "Tercile",
ylab = "Prevalence",
col = c("pink", "blue"),
legend.text = TRUE,
names.arg = prevalence_data$Category
)
################################################################################
####################################Data File Creation##########################
output_age = df$age
output_sex = ifelse(df$sex=="1","Male","Female")
output_income = df$inc
output_sector = ifelse(df$sector=="0","Rural","Urban")
output_carb = df$carb
output_fat = df$fat
output_protein = df$protein
output_sodium = df$sodium
output_energyintake = df$energy
output_df = data.frame(output_sex, output_age,output_sector,output_income,output_carb,output_fat,output_protein,output_sodium,output_energyintake)
colnames(output_df) = c("sex","age","sector","income","carb","fat","protein","sodium","energyintake")
# Calculate the number of rows to print
rows_to_print <- round(nrow(df) * (rate_to_print))
# Sample the rows to print
sampled_rows <- sample(1:nrow(df), size = rows_to_print)
# Create output dataframe with sampled rows
sampled_df <- output_df[sampled_rows, ]
# Order the dataframe by age and sex
sampled_df <- sampled_df[order(sampled_df$age, sampled_df$sex), ]
# Set row names from 1 to the last row
rownames(sampled_df) <- 1:nrow(sampled_df)
# Write the output dataframe to a CSV file
write.csv(sampled_df, file = paste0("out/",country, ".DataFile.Incomplete.csv"))
#####################################################################################
#####################################################################################