-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfinal-project-write-up-nathan-nguyen.tex
993 lines (823 loc) · 39.7 KB
/
final-project-write-up-nathan-nguyen.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor}
%
\documentclass[
]{article}
\usepackage{amsmath,amssymb}
% Engine-dependent font and encoding setup. iftex supplies the \ifPDFTeX
% conditional tested below (and \ifLuaTeX, used further down the preamble).
\usepackage{iftex}
\ifPDFTeX
% pdfTeX branch: T1 font encoding and UTF-8 input encoding.
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
\usepackage{unicode-math} % this also loads fontspec
% Default font features: scale fonts via Scale=MatchLowercase, except the
% roman family, which is kept at Scale=1 and gets Ligatures=TeX.
\defaultfontfeatures{Scale=MatchLowercase}
\defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
\usepackage{lmodern}
% The conditional below has nothing but a comment in its non-pdfTeX branch,
% so it currently selects no additional fonts.
\ifPDFTeX\else
% xetex/luatex font selection
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
% microtype is optional as well: it is loaded only when microtype.sty is
% found; the empty final argument means nothing happens otherwise.
\IfFileExists{microtype.sty}{% use microtype if available
\usepackage[]{microtype}
\UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph spacing setup. \makeatletter/\makeatother bracket the block
% because \@ifundefined contains an @ in its name.
\makeatletter
% \KOMAClassName is used as the test for a KOMA-Script class; this file uses
% article, so the first (non-KOMA) branch is the one expected to run.
\@ifundefined{KOMAClassName}{% if non-KOMA class
\IfFileExists{parskip.sty}{%
\usepackage{parskip}
}{% else
% Fallback when parskip.sty is missing: no paragraph indent and a
% stretchable 6pt gap between paragraphs.
\setlength{\parindent}{0pt}
\setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
\KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\usepackage[margin=1in]{geometry}
\usepackage{color}
% Syntax-highlighting support for code listings. Presumably emitted by
% pandoc's highlighter (the PDF creator is set to "LaTeX via pandoc" below);
% none of these macros is used in the part of the document visible here.
\usepackage{fancyvrb}
% \VerbBar expands to a literal vertical bar.
\newcommand{\VerbBar}{|}
% \VERB and the Highlighting environment are fancyvrb verbatim variants in
% which backslash and braces keep their command meaning (commandchars), so
% the *Tok macros below can be used inside verbatim text.
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
% Shaded wraps its contents in framed's snugshade box, which is filled with
% shadecolor (a light grey, RGB 248,248,248).
\usepackage{framed}
\definecolor{shadecolor}{RGB}{248,248,248}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
% One macro per token class. Each takes the token text as #1 and applies a
% colour and/or bold/italic shape; BuiltInTok, ExtensionTok, ImportTok,
% NormalTok and RegionMarkerTok pass their argument through unchanged.
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\BuiltInTok}[1]{#1}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
\newcommand{\ExtensionTok}[1]{#1}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\ImportTok}[1]{#1}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\NormalTok}[1]{#1}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\RegionMarkerTok}[1]{#1}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\usepackage{graphicx}
% \maxwidth / \maxheight: the image's natural width (height), capped at
% \linewidth (\textheight). \makeatletter is required because the natural
% size is read from graphicx's internal \Gin@nat@width / \Gin@nat@height.
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
% (the figures visible in this document pass [H] explicitly, which takes
% precedence over this default).
\makeatletter
\def\fps@figure{htbp}
\makeatother
% Underlining support. The section headings in the body call \ul{...};
% under LuaTeX it is presumably supplied by lua-ul's soul option (confirm
% against the lua-ul documentation), otherwise by the soul package.
\ifLuaTeX
\usepackage{luacolor}
\usepackage[soul]{lua-ul}
\else
\usepackage{soul}
\fi
\setlength{\emergencystretch}{3em} % prevent overfull lines
% \tightlist zeroes item and paragraph spacing inside a list.
% \providecommand defines it only if it is not already defined.
\providecommand{\tightlist}{%
\setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{-\maxdimen} % remove section numbering
\usepackage{amsmath}
\usepackage{sectsty} \allsectionsfont{\centering}
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
\ifLuaTeX
\usepackage{selnolig} % disable illegal ligatures
\fi
\usepackage{bookmark}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same}
% PDF metadata and link colouring. Maroon is expected to come from the
% dvipsnames/svgnames/x11names options passed to xcolor at the top of the
% preamble.
\hypersetup{
pdfauthor={Nathan A. Nguyen},
colorlinks=true,
linkcolor={Maroon},
filecolor={Maroon},
citecolor={Blue},
urlcolor={blue},
pdfcreator={LaTeX via pandoc}}
% Title block; \vspace{5cm} pushes the title down the page.
% NOTE(review): the three title lines are separated only by newlines, which
% TeX treats as spaces, so they typeset as one running line -- confirm
% whether explicit line breaks were intended.
\title{\vspace{5cm}
GIS 563
Local Statistical Modeling
Final Project}
\author{Nathan A. Nguyen}
\date{10 December 2024}
\begin{document}
\maketitle
\newpage
\section{\texorpdfstring{\ul{Introduction}}{Introduction}}\label{introduction}
This write-up serves as the second part submission for the final project
in the class.
For this project, an empirical application of MGWR was performed and
compared with a global OLS model. The response variable was the
estimated median household income in 2021, and the spatial units were
United States counties. Details about the dataset and any preprocessing
that occurred will be discussed. The methods section will cover the
models explored and the software implementation as well as what R
packages were used for mapping for those who are interested. Finally,
this write-up will close with brief discussions of the results and any
room for improvement should this be a real academic project.
The model presented in this write-up will deviate slightly from the
model presented in part 1. The modifications were made to accommodate
critiques while presenting -- namely the use of poverty level as a
predictor. This variable was replaced with the percentage of the
population in a respective county that are considered in an urban area.
During the initial presentation, the Monte Carlo test was still running,
so no results were available.
Due to time constraints, and unexpected events, a Monte Carlo test for
the existence of spatial variability was not performed for the model
presented in this write-up. The Monte Carlo test for the first version
of this model did complete eventually, and it suggested that the only
variable with evidence for spatial variability was the all age poverty
levels in 2021. I included this variable initially because although
poverty is obviously associated with income, I wanted to see whether or
not the effects of poverty on median income were uniform across space or
if they changed based on geography.
That being said, a rigorous test for spatial variance will not be
provided for this second model presented.
\newpage
\section{\texorpdfstring{\ul{Data
Details}}{Data Details}}\label{data-details}
The dataset used for analysis is an amalgamation of various datasets
from the United States Census Bureau/United States Department of
Commerce, the American Community Survey, the United States Department of
Agriculture's Economic Research Service, and from a 2022 paper by
Fotheringham et al.\textsuperscript{1}
The units of study were United States counties, and only mainland
counties were intended to be retained in the dataset. For transparency,
most of Connecticut is missing, and this issue was not observed until
after the fact. The missingness is attributed to the non-standardization
of FIPS codes among the various datasets used. Some locations in
Connecticut are not considered true counties and are ``county
equivalents'', which was not known a~priori.
After preprocessing, the final dataset consisted of 3,100 locations.
Some R functions were defined in order to assist the preprocessing step
-- namely cleaning of FIPS codes and joining all of the datasets
together.
The chosen response variable was the estimated median household income
in the year 2021 and was provided by the Census Bureau/Department of
Commerce\textsuperscript{6}. Nine predictors were included in the
models:
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
Gini Index (1-year estimate; 2021)\textsuperscript{5}
\item
Population Density (natural logged)\textsuperscript{1}
\item
Percent of Households with Internet Access (5-year estimate;
2017-2022)\textsuperscript{2}
\item
Percent of Population with Bachelors Degree or
Higher\textsuperscript{3}
\item
Percent of Population Living in an Urban Area (1-year estimate;
2020)\textsuperscript{6}
\item
Sex Ratio (Male-to-Female) (5-year estimate;
2017-2022)\textsuperscript{4}
\item
Median Age (5-year estimate; 2017-2022)\textsuperscript{4}
\item
Percent Population that is Black (5-year estimate;
2017-2022)\textsuperscript{4}
\item
Percent Population that is Hispanic or Latino (5-year estimate;
2017-2022)\textsuperscript{4}
\end{enumerate}
\begin{longtable}[t]{lcccc}
\caption{\label{tab:unnamed-chunk-2}Summary Statistics for Selected Variables}\\
\toprule
Variable & Min & Mean & Median & Max\\
\midrule
Median Income (21) & 25653.00 & 58741.99 & 56465.50 & 153716.00\\
Gini Index (17-21) & 0.25 & 0.45 & 0.44 & 0.73\\
Population Density (Natural Log) & -1.93 & 3.78 & 3.78 & 10.77\\
\% Internet Access (21) & 35.97 & 82.78 & 83.89 & 100.00\\
\% with Bachelor's Degree or Higher (18–22) & 0.00 & 23.44 & 20.90 & 78.90\\
\addlinespace
\% Population in Urban Area (20) & 0.00 & 35.95 & 33.41 & 100.00\\
Sex Ratio (Male:Female, 17–21) & 76.90 & 101.93 & 99.60 & 221.30\\
Median Age (17–21) & 22.40 & 41.52 & 41.30 & 68.10\\
\% Black (17–21) & 0.00 & 9.03 & 2.26 & 87.12\\
\% Hispanic or Latino (17–21) & 0.00 & 9.82 & 4.49 & 98.22\\
\bottomrule
\end{longtable}
\newpage
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/median-income21}
}
\caption{Estimated Median Household Income (2021)}\label{fig:unnamed-chunk-3}
\end{figure}
Figure 1 shows the estimated median household income in 2021. By
inspection, there appears to be some clustering of this random variable.
For example, we see that the median household income is generally higher
in the north-east coast of the United States, indicated by the darker
coloring, when compared to the deep south and Appalachia, indicated by
the lighter shading. The west coast, generally, has higher income as
well.
This is likely due to the fact that the north-east and west-coast are
more developed regions of the country with high paying industries like
finance and technology while the regions with lower income are more
rural and have declining industries e.g., manufacturing and mining.
\newpage
Furthermore, the regions with higher median household income generally
have higher educational attainment as well. Figure 2 is evidence of
this:
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/bach-higher-18-22}
}
\caption{Percent of Population Having Bachelors Degree or Higher}\label{fig:unnamed-chunk-4}
\end{figure}
\newpage
\section{\texorpdfstring{\ul{Methods}}{Methods}}\label{methods}
A global OLS model and an MGWR model were fit on the dataset and compared
to one another. The variance explained by each model was compared, as well
as the AICc. All variables were standardized to have mean zero and
variance one, and standardization was performed in R. All features in
the model are of order one, and no interaction terms were considered.
A linear model was fit in R using the \texttt{lm()} function while the
MGWR model was fit using the MGWR 2.2 GUI software. I also calibrated an
MGWR with the \texttt{mgwr} python module, but I ran into issues while
extracting some data.
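Ad-hoc tests were used in place of a Monte Carlo test. A coefficient was
flagged as spatially varying when the interquartile range (IQR) of its
local estimates exceeded twice its standard error in the global model:
\[
\mathrm{IQR}_{k} > 2\times \mathrm{SE}_{k,\text{global}}
\]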
Corrected \(\alpha\)-values were computed following:
\[
\alpha_{j} = \frac{\alpha^{*}}{\mathrm{ENP}_{j}}
\]
where \(\alpha^{*} = 0.05\) and \(\mathrm{ENP}_{j}\), the effective number
of parameters for the \(j\)-th covariate, was obtained from the
\texttt{txt} file from the MGWR 2.2 session.
\newpage
\subsection{\texorpdfstring{\ul{Global OLS
Model}}{Global OLS Model}}\label{global-ols-model}
\begin{table}[H]
\renewcommand{\arraystretch}{1.5} % Adjust row spacing
\centering
\caption{Global OLS Results}
\label{tab:ols_results}
\begin{tabular}{lcccc}
\hline
\textbf{Variable} & \textbf{Estimate} & \textbf{Std. Error} & \textbf{t-value} & \textbf{p-value} \\ \hline
Intercept & 5.87e-17 & 1.00e-02 & 0.000 & 1.000 \\
Gini Index (17–21) & -0.259 & 0.011 & -22.595 & $<$2e-16 *** \\
Population Density (Log) & 0.179 & 0.016 & 11.468 & $<$2e-16 *** \\
\% Internet Access (21) & 0.237 & 0.015 & 15.627 & $<$2e-16 *** \\
\% Bachelor's Degree or Higher (18–22) & 0.579 & 0.014 & 41.660 & $<$2e-16 *** \\
\% Population in Urban Area (20) & -0.124 & 0.018 & -6.994 & 3.26e-12 *** \\
Sex Ratio (Male:Female, 17–21) & 0.351 & 0.107 & 3.295 & 0.000997 *** \\
Median Age (17–21) & 0.099 & 0.118 & 0.845 & 0.398 \\
\% Black (17–21) & -0.059 & 0.012 & -4.929 & 8.70e-07 *** \\
\% Hispanic or Latino (17–21) & 0.080 & 0.012 & 6.975 & 3.73e-12 *** \\
\hline
\multicolumn{5}{l}{\textit{Residual Standard Error:} 0.5578 on 3090 degrees of freedom} \\
\multicolumn{5}{l}{\textit{Multiple R-squared:} 0.6897, \textit{Adjusted R-squared:} 0.6888} \\
\multicolumn{5}{l}{\textit{F-statistic:} 763.3 on 9 and 3090 DF, \textit{p-value:} $<$2.2e-16} \\
\hline
\multicolumn{5}{l}{\textbf{Signif. Codes:} *** $p < 0.001$, ** $p < 0.01$, * $p < 0.05$} \\
\end{tabular}
\end{table}
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-6-1}
}
\caption{Global OLS Added Variable Plots}\label{fig:unnamed-chunk-6}
\end{figure}
A summary of the global OLS model results is contained in Table 2. The
global model is able to explain approximately \(69\%\) of the variance
of standardized median income (adjusted \(R^{2}\)). Among all
predictors, all are statistically significant at \(\alpha = 0.05\)
except for median age. A side comment is that with sufficiently large
sample sizes, any non-trivial effects will be statistically significant.
Effect sizes might be a better measure of model quality in the future.
Educational attainment had the largest positive effect on median income
as seen in Table 2 as well as the added variable plots (Figure 3). A one
standard deviation increase in the percentage of the population having a
bachelors degree or higher is associated with a 0.579 standard deviation
increase in the median household income with all other variables held
constant. This result is unsurprising and is well supported in modern
socioeconomic theories; however, it underscores the importance of
educational attainment for earning potential. Higher paying industries
like technology, engineering, and so on oftentimes require at least a
bachelors degree to be considered ``qualified'' for a role.
The Gini Index, a measure of income inequality, had the largest negative
effect on median income. A one standard deviation increase in the index is
associated with a 0.259 standard deviation decrease in the median
income with all other variables held constant. This can also be observed
in Figure 3.
The percent with internet access and population density variables both
have positive effects on the median income. This might reflect that
counties that have more developed infrastructure and are more densely
populated have higher median incomes, which is a logical conclusion. If
there's a large and densely populated area, then there's an incentive
to invest in infrastructure. Interestingly, though, the percent of
the population living in an urban area has a negative effect on the
median income.
The sex ratio also has a relatively large and positive effect on median
income. A one standard deviation increase in the male-to-female ratio is
associated with an increase in median income of about 0.351 standard
deviations, with all other variables held constant. This could be
explained by the so-called gender pay gap, but could also be
largely attributed to what industry someone works in. After all, this
dataset is very aggregated.
A one standard deviation increase in the percentage of the population that
is Black is associated with a 0.059 standard deviation decrease in median
income, while the same increase in the Hispanic or Latino percentage is
associated with a 0.080 standard deviation increase in the median income.
In both cases, the effects seem marginal, and the dominant reference
group is the White population.
Although the global model has strong explanatory power
(\(R_{\mathrm{adj}}^{2} \approx 69\%\)), it is not without limitations. The
diagnostic plots (Figure 4) indicate potential violations of the
assumptions of linear regression -- namely heteroskedasticity
(non-constant variance) and non-normally distributed residuals.
Figure 4 is evidence of heteroskedasticity as there is a
cone structure in the residuals: the residuals get larger for larger
predicted values of \(Y\). Furthermore, it can be observed that the
distribution of residuals has a fatter right tail and a skinnier left
tail. If the residuals were distributed normally, then the standardized
residuals would be more symmetric and would hug the theoretical line
more tightly.
Figure 5 also suggests that higher-order terms, or at least some
transformation on the raw variables, might be warranted. The
component-residual plot for the percent with internet access variable
shows curvature in the data. This indicates a non-linear specification
of this variable might be the proper functional form. All other graphs
are relatively linear.
\begin{figure}[H]
{\centering \includegraphics{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-7-1}
}
\caption{Global OLS Diagnostics}\label{fig:unnamed-chunk-7}
\end{figure}
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-8-1}
}
\caption{Global OLS Component Regression Plots}\label{fig:unnamed-chunk-8}
\end{figure}
\newpage
On the issue of heteroskedasticity, it is possible that spatial
autocorrelation is a contributing factor. It's reasonable to suspect that
nearby locations are more similar to one another. In this case, counties with
higher median income might be clustered together. To assess this, a
Moran's Test was implemented on the OLS residuals. Queen's contiguity
was used to define neighbors, and row-standardized weights were chosen
to give all neighbors equal weights. A zero policy was enabled due to
some missing counties -- namely in Connecticut.
The results of the Moran's test are:
\begin{table}[H]
\renewcommand{\arraystretch}{1.3} % Adjust row spacing
\setlength{\tabcolsep}{12pt} % Adjust column spacing for wider table
\centering
\caption{Global OLS Residual Moran's I Test Results}
\label{tab:global_ols_morans_i}
\makebox[\textwidth]{ % Makes the table span the entire text width
\begin{tabular}{lcc}
\hline
\textbf{Statistic} & \textbf{Value} \\ \hline
Moran's I Statistic & 0.3087 \\
Expectation & -0.0003 \\
Variance & 0.0001 \\
Standard Deviate & 28.675 \\
\textit{p-value} & $<$ 2.2e-16 \\
\hline
\textbf{Alternative Hypothesis} & Greater \\
\multicolumn{2}{l}{\textit{Notes:} Moran's I test under randomization.} \\
\hline
\end{tabular}
}
\end{table}
The test is significant at the level of 0.05, and so the null hypothesis
is rejected. There is sufficient evidence to suggest that positive
spatial autocorrelation exists.
Another way to look at this is to plot the response variable against its
spatially lagged counterpart (and similarly for the OLS residuals), as seen
in Figures 6--7:
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-10-1}
}
\caption{Moran Plot for Median Income}\label{fig:unnamed-chunk-10}
\end{figure}
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-11-1}
}
\caption{Moran Plot for Global OLS Residuals}\label{fig:unnamed-chunk-11}
\end{figure}
If spatial autocorrelation did not exist, the slope of the fitted line
would be approximately zero. In this case, there is clearly a positive
slope; in fact, the slope of the line in the residual plot (Figure 7) is
the value of Moran's I in Table 3.
The implication is that maybe a global OLS model is insufficient in
explaining the data generating process and that a local modeling
approach might better capture the underlying processes. This is where
MGWR comes into play.
\newpage
\subsection{\texorpdfstring{\ul{MGWR
Model}}{MGWR Model}}\label{mgwr-model}
\begin{table}[H]
\renewcommand{\arraystretch}{1.3}
\centering
\caption{MGWR Model Summary}
\label{tab:mgwr_summary}
\begin{tabular}{lccccc}
\hline
\textbf{Variable} & \textbf{Min} & \textbf{Mean} & \textbf{Median} & \textbf{Max} & \textbf{Bandwidth (95\% CI)} \\ \hline
Intercept & -0.584 & 0.007 & -0.018 & 0.949 & 44 [44, 44] \\
Gini Index (17–21) & -0.568 & -0.206 & -0.193 & 0.055 & 92 [82, 107] \\
Population Density (Log) & -0.172 & 0.040 & 0.063 & 0.196 & 588 [488, 764] \\
\% Internet Access (21) & -0.390 & 0.269 & 0.230 & 1.143 & 44 [44, 46] \\
\% Bachelor's Degree or Higher (18–22) & -0.137 & 0.471 & 0.477 & 0.971 & 52 [48, 57] \\
\% Population in Urban Area (20) & -0.071 & -0.049 & -0.050 & -0.028 & 2263 [1932, 2654] \\
Sex Ratio (Male:Female, 17–21) & -0.004 & 0.032 & 0.029 & 0.136 & 626 [488, 764] \\
Median Age (17–21) & -0.297 & 0.041 & 0.060 & 0.258 & 142 [132, 172] \\
\% Black (2017–2021) & -0.228 & -0.226 & -0.226 & -0.225 & 3098 [2378, 3098] \\
\% Hispanic or Latino (17–21) & -0.195 & 0.044 & 0.067 & 0.252 & 473 [423, 594] \\ \hline
\textbf{Metric} & \multicolumn{5}{l}{} \\ \hline
Residual Sum of Squares & \multicolumn{5}{r}{328.055} \\
Log-Likelihood & \multicolumn{5}{r}{-917.446} \\
AIC & \multicolumn{5}{r}{3023.785} \\
AICc & \multicolumn{5}{r}{3306.439} \\
BIC & \multicolumn{5}{r}{6613.743} \\
R\textsuperscript{2} & \multicolumn{5}{r}{0.894} \\
Adjusted R\textsuperscript{2} & \multicolumn{5}{r}{0.869} \\
Degree of Dependency (DoD) & \multicolumn{5}{r}{0.492} \\ \hline
\end{tabular}
\end{table}
\begin{table}[H]
\renewcommand{\arraystretch}{1.3} % Adjust row spacing
\setlength{\tabcolsep}{12pt} % Adjust column spacing for wider table
\centering
\caption{IQR (ad-hoc) Results}
\label{tab:iqr_results}
\begin{tabular}{lcccc}
\hline
\textbf{Variable} & \textbf{IQR} & \textbf{SE (Global)} & \textbf{Threshold} & \textbf{Significant} \\ \hline
Intercept & 0.3380 & 0.010 & 0.020 & True \\
Gini Index (17–21) & 0.1420 & 0.011 & 0.022 & True \\
Population Density (Log) & 0.1780 & 0.016 & 0.032 & True \\
Median Age (17–21) & 0.1290 & 0.012 & 0.024 & True \\
\% Bachelor's Degree or Higher (18–22) & 0.2470 & 0.014 & 0.028 & True \\
\% Black (2017–2021) & 0.0017 & 0.012 & 0.024 & False \\
\% Hispanic or Latino (17–21) & 0.1670 & 0.012 & 0.024 & True \\
\% Internet Access (21) & 0.2650 & 0.015 & 0.030 & True \\
\% Population in Urban Area (20) & 0.0256 & 0.018 & 0.036 & False \\
Sex Ratio (Male:Female, 17–21) & 0.0312 & 0.011 & 0.022 & True \\
\hline
\end{tabular}
\end{table}
\newpage
Table 4 provides a summary of the MGWR calibration. This local model is
able to explain about \(87\%\) of the variance in the data (adjusted
\(R^{2}\)). The IQR ad-hoc procedure was performed instead of the
recommended Monte Carlo test due to time constraints. All variables
except for the percent of the population that is Black and the percent of
the population living in an urban area showed evidence for spatial
variability under this method.
The intercept (location if all other variables were homogeneous and
zero), Gini Index, percent with internet access, percent with a
bachelors degree or higher, and median age all have relatively small
bandwidths and narrow bandwidth confidence intervals when compared to
the overall number of locations, \(N = 3,100\). This indicates that the
effects of these variables are very local i.e., the effects of these
variables are not uniform across space, which a global OLS incorrectly
assumes. Refer to Figures 8, 9, 11, and 12. If we were to plot these
variables' significant local parameter estimates, we'd expect to see
clusters and a lot of variation in the parameter surface, whereas more
regional and uniform effects, indicated by larger bandwidths, would have
a more uniform/smooth coloring on the entire parameter surface.
Population density, sex ratio, median age, and the percent of the
population that are Hispanic or Latino have larger bandwidths than the
prior variables discussed, but they are not large enough to say they
have global effects. We argue that these variables
have more of a regional effect. For these regional effects, the coloring
of the parameter surface will be more smooth when compared to the very
local effects, but not completely uniform which would be the case for
global effects (large bandwidths). Refer to figures 10, 13, 14, and 15.
The percentage of the population living in an urban area and the
percent of the population that is Black are considered to have global
effects. Their bandwidth point estimates cover over \(70\%\) of
the overall number of locations. The parameter surfaces for these
variables are uniform, and so no clustering will be observed. These two
variables show no evidence of spatial variability according to the ad-hoc
procedure.
The following corrected \(\alpha\)-values were used for plotting
significant local parameter estimates:
\begin{table}[H]
\renewcommand{\arraystretch}{1.3} % Adjust row spacing
\setlength{\tabcolsep}{12pt} % Adjust column spacing for a wider table
\centering
\caption{Corrected Alpha Levels for MGWR Variables}
\label{tab:variable_enps}
\begin{tabular}{lccc}
\hline
\textbf{Variable} & \textbf{ENP\textsubscript{j}} & \textbf{Alpha Corrected} \\ \hline
Intercept & 164.854 & 0.000303 \\
Gini Index (17–21) & 76.129 & 0.000657 \\
Population Density (Log) & 6.687 & 0.007477 \\
\% Internet Access (21) & 149.197 & 0.000335 \\
\% Bachelor's Degree or Higher (18–22) & 125.310 & 0.000399 \\
\% Population in Urban Area (20) & 1.920 & 0.026042 \\
Sex Ratio (Male:Female, 17–21) & 11.088 & 0.004509 \\
Median Age (17–21) & 48.350 & 0.001034 \\
\% Black (2017–2021) & 1.027 & 0.048685 \\
\% Hispanic or Latino (17–21) & 8.884 & 0.005628 \\
\hline
\end{tabular}
\end{table}
\newpage
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-intercept}
}
\caption{Significant Local Intercept Estimates}\label{fig:unnamed-chunk-12}
\end{figure}
If all other variables were zero and homogeneity is assumed, then location
plays a role in median income. Counties in south Texas and in Virginia
have higher median incomes when compared to some counties in Kansas,
Missouri, and Arkansas, and this is just location dependent.
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-gini}
}
\caption{Significant Local Gini Index Estimates}\label{fig:unnamed-chunk-13}
\end{figure}
The effects of income inequality (Gini Index) are local, as seen in its
small bandwidth and variation in the parameter surface. Although an
increase in income inequality is associated with a decrease in median
income all around, the effects are not the same everywhere in the United
States. For some reason, an increase in inequality is more severe in the
Great Lakes region, in the eastern Virginian counties, Maryland, and
Delaware when compared to southern California, some parts of Arizona,
and Texas.
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-ln-pop-den}
}
\caption{Significant Local Population Density Estimates}\label{fig:unnamed-chunk-14}
\end{figure}
Population density appears to be more of a regional effect given its
larger bandwidth. This observation can also be seen in the coloring of
the parameter surface. Although there are obvious variations in the
surface, they are much smoother and more gradual when compared to the surface for
Gini or educational attainment, for example.
What is interesting is that west of Illinois, an increase in population
density appears to be associated with an increase in median income, while
an increase in population density appears to be associated with a
decrease in median income east of Ohio -- specifically near the US
capital and the entire northeast of the country. The effects appear to
be marginal though.
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-internet-access}
}
\caption{Significant Local Percent with Internet Access Estimates}\label{fig:unnamed-chunk-15}
\end{figure}
The effects of internet access are apparently local as well given its
small bandwidth. For some reason internet access has the strongest
positive effects in Colorado. Nonetheless, having internet access is
associated with an increase in median income all around, although many
of the local estimates are non-significant.
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-pct-bach-higher}
}
\caption{Significant Local Bachelor's Degree or Higher Estimates}\label{fig:unnamed-chunk-16}
\end{figure}
The effects of having a bachelor's degree or higher are very local given
its small bandwidth and the clustering observed in the map. Educational
attainment has a much stronger positive effect on median income in
New Jersey, some parts of Vermont, eastern Pennsylvania, and northern
Ohio when compared to Arizona, Montana, and Idaho. The darker structure
in the Appalachia region is also interesting to observe. The
parameter surface for having a bachelor's degree or higher is a great
example of local effects.
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-sex-ratio}
}
\caption{Significant Local Sex Ratio Estimates}\label{fig:unnamed-chunk-17}
\end{figure}
The effects of having more males than females (sex ratio) are regional,
although they are only really noticeable in North/South Dakota,
Minnesota, Texas, Louisiana, southern New Mexico, and southern Arizona.
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-median-age}
}
\caption{Significant Local Median Age Estimates}\label{fig:unnamed-chunk-18}
\end{figure}
Median age is only significant in a small percentage of the population.
An increase in the median age is associated with a decrease in median
income near El Paso, Texas as well as in Maine, but it is associated
with an increase in median income near Washington DC.
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-pct-hisplat}
}
\caption{Significant Local Percent Population Hispanic or Latino Estimates}\label{fig:unnamed-chunk-19}
\end{figure}
The effects of an increase in the Hispanic or Latino population are
regional, which makes sense. That being said, an increase in Hispanic or
Latino population is associated with a decrease in median income in the
south-west of the country, but these effects appear to be marginal. An
interesting observation is that an increase in the Hispanic or Latino
population has a strong positive effect in the rust-belt region of the
country.
\newpage
\subsection{\texorpdfstring{\ul{Comparing Global OLS
vs.~MGWR}}{Comparing Global OLS vs.~MGWR}}\label{comparing-global-ols-vs.-mgwr}
\begin{table}[H]
\renewcommand{\arraystretch}{1.3}
\centering
\caption{Comparison: Global OLS vs. MGWR}
\label{tab:ols_vs_mgwr}
\begin{tabular}{lccc}
\hline
\textbf{Variable} & \textbf{OLS Estimate} & \textbf{MGWR Mean} & \textbf{MGWR Median} \\ \hline
Intercept & 0.000 & 0.007 & -0.018 \\
Gini Index (17–21) & -0.259 & -0.206 & -0.193 \\
Population Density (Log) & 0.179 & 0.040 & 0.063 \\
\% Internet Access (21) & 0.237 & 0.269 & 0.230 \\
\% Bachelor's Degree or Higher (18–22) & 0.578 & 0.471 & 0.477 \\
\% Population in Urban Area (20) & -0.124 & -0.049 & -0.050 \\
Sex Ratio (Male:Female, 17–21) & 0.035 & 0.032 & 0.029 \\
Median Age (17–21) & 0.010 & 0.041 & 0.060 \\
\% Black (2017–2021) & -0.060 & -0.226 & -0.226 \\
\% Hispanic or Latino (17–21) & 0.080 & 0.044 & 0.067 \\ \hline
\textbf{Metric} & \textbf{Global OLS} & \textbf{MGWR} \\ \hline
Residual Sum of Squares & 961.468 & 328.055 \\
Log-Likelihood & -2584.131 & -917.446 \\
AIC & 5188.262 & 3023.785 \\
AICc & 5190.347 & 3306.439 \\
BIC & N/A & 6613.743 \\
R\textsuperscript{2} & 0.690 & 0.894 \\
Adjusted R\textsuperscript{2} & 0.689 & 0.869 \\
Degree of Dependency (DoD) & N/A & 0.492 \\ \hline
\end{tabular}
\end{table}
Table 7 is a summary comparing the global OLS and MGWR models. The signs
of the parameter estimates are consistent between the global OLS and
MGWR framework indicating that the global model does agree with the
direction of the effects that the predictor variables have when compared
to MGWR.
MGWR has superior explanatory power when compared to global OLS
(\(R^{2}_{\mathrm{adj},\mathrm{MGWR}} \approx 87\% > R^{2}_{\mathrm{adj},\mathrm{OLS}} \approx 69\%\)).
Despite having far more parameters than OLS, MGWR's AICc is smaller than
OLS' AICc (\(\mathrm{AICc}_{\mathrm{MGWR}} \approx 3306 < \mathrm{AICc}_{\mathrm{OLS}} \approx 5190\)). So
while being a more complicated model, it does a better job at explaining
the data when compared to OLS, and so the trade-off is worthwhile.
\newpage
The existence of spatial autocorrelation was again explored, but now
with the MGWR generated residuals, and the same procedure for the
Moran's test was implemented on the MGWR residuals. The test failed to
reject the null hypothesis, and so there is insufficient evidence to
suggest any sort of spatial autocorrelation in the MGWR residuals. Refer
to Table 8 for details. Furthermore, Figure 16 is a mapping of the OLS
and MGWR residuals. Although subtle, the clustering of the residuals has
been mitigated with MGWR.
\begin{table}[H]
\renewcommand{\arraystretch}{1.3} % Adjust row spacing
\setlength{\tabcolsep}{12pt} % Adjust column spacing for wider table
\centering
\caption{MGWR Residual Moran's I Test Results}
\label{tab:mgwr_morans_i}
\makebox[\textwidth]{ % Makes the table span the entire text width
\begin{tabular}{lc}
\hline
\textbf{Statistic} & \textbf{Value} \\ \hline
Moran's I statistic & 0.0023 \\
Expectation & -0.0003 \\
Variance & 0.0001 \\
Standard Deviate & 0.2432 \\
\textit{p-value} & 0.4039 \\
\hline
\textbf{Alternative Hypothesis} & Greater \\
\multicolumn{2}{l}{\textit{Notes:} Moran's I test under randomization.} \\
\hline
\end{tabular}
}
\end{table}
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/ols-mgwr-residual-combined}
}
\caption{Global OLS Residuals vs. MGWR Residuals}\label{fig:unnamed-chunk-20}
\end{figure}
\newpage
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-21-1}
}
\caption{Moran Plot for MGWR Residuals}\label{fig:unnamed-chunk-21}
\end{figure}
A Moran plot was also generated for the MGWR residuals (Figure 17). The
slope of the line is now approximately zero, indicating no spatial autocorrelation.
\newpage
\begin{figure}[H]
{\centering \includegraphics[width=1\linewidth]{images/nonlinearity/local-betas-vs-x}
}
\caption{MGWR Diagnostic Plots}\label{fig:unnamed-chunk-22}
\end{figure}
Figure 18 shows various plots of the local \(\hat{\beta}\)s against the
standardized values for each predictor. Upon inspection, first-order
specifications for population density, percent with internet access, and
percent of population that are Hispanic or Latino may not be
appropriate. Polynomial terms, a transformation of the raw variables
before standardization, or interaction terms might mitigate non-linearity
issues, but that is for future work.
\newpage
\section{\texorpdfstring{\ul{Conclusions/Improvements}}{Conclusions/Improvements}}\label{conclusionsimprovements}
If I were to write a real paper on something like this, I'd change my
response variable to something that reflects discretionary income more
e.g., the median household income after adjusting for housing costs.
While people in California, New York, and Washington might make a lot
more than say, someone living in Alabama, people living in California,
New York, or Washington likely have a much larger housing cost when
compared to someone living in Alabama.
A transformation of some of the predictor variables might be warranted
given some non-linearity observed in both OLS and MGWR diagnostic plots.
In addition, a thorough literature review would have been beneficial in
order to understand the unique socioeconomic profiles of the US
counties. This would have aided in better understanding the results of
MGWR and/or helped validate existing theories in the literature. A
review would have also provided a better foundation for variable
selection in the models e.g., including employment industry variables,
and so on.
Finally, a Monte Carlo test should have been implemented to detect
spatial variability.
For those who are interested in creating the maps in R, please refer to
my GitHub repository:
\url{https://github.com/loafing-cat/gis563-local-stat-model-example}.
The following are the core R libraries for mapping:
\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(tidyverse)}
\FunctionTok{library}\NormalTok{(sf)}
\FunctionTok{library}\NormalTok{(tigris)}
\FunctionTok{library}\NormalTok{(colorspace)}
\end{Highlighting}
\end{Shaded}
\newpage
\section{\texorpdfstring{\ul{References}}{References}}\label{references}
\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
Li, Z., \& Fotheringham, A. S. (2022). The spatial and temporal
dynamics of voter preference determinants in four U.S. presidential
elections (2008--2020). Transactions in GIS, 26, 1609--1628.
\url{https://doi.org/10.1111/tgis.12880}
\item
U.S. Census Bureau. (2022). Internet Subscriptions in Household.
American Community Survey, ACS 5-Year Estimates Detailed Tables, Table
B28011. Retrieved December 7, 2024, from
\url{https://data.census.gov/table/ACSDT5Y2022.B28011?q=Telephone,%20Computer,%20and%20Internet%20Access&g=010XX00US$0500000}.
\item
United States Department of Agriculture, Economic Research Service.
(2022). County-Level Data Sets: Poverty estimates. Retrieved from
\url{https://www.ers.usda.gov/data-products/county-level-data-sets/county-level-data-sets-download-data/}
\item
U.S. Census Bureau. (2021). ACS DEMOGRAPHIC AND HOUSING ESTIMATES.
American Community Survey, ACS 5-Year Estimates Data Profiles, Table
DP05. Retrieved December 9, 2024, from
\url{https://data.census.gov/table/ACSDP5Y2021.DP05?q=Density&t=Populations%20and%20People&g=010XX00US$0500000}.
\item
U.S. Census Bureau. (2021). GINI INDEX OF INCOME INEQUALITY. American
Community Survey, ACS 5-Year Estimates Detailed Tables, Table B19083.
Retrieved December 9, 2024, from
\url{https://data.census.gov/table/ACSDT5Y2021.B19083?q=gini&g=010XX00US$0400000}.
\item
United States Census Bureau. (2023). County-level 2020 Census Urban
and Rural Information for the U.S., Puerto Rico, and Island Areas
sorted by state and county FIPS codes {[}Data file{]}. Retrieved from
\url{https://www.census.gov/programs-surveys/geography/guidance/geo-areas/urban-rural.html}
\end{enumerate}
\end{document}