final-project-write-up-nathan-nguyen.tex

% Options for packages loaded elsewhere
\PassOptionsToPackage{unicode}{hyperref}
\PassOptionsToPackage{hyphens}{url}
\PassOptionsToPackage{dvipsnames,svgnames,x11names}{xcolor}
%
\documentclass[
]{article}
\usepackage{amsmath,amssymb}
\usepackage{iftex}
% Engine-dependent font setup (relies on the \ifPDFTeX switch from iftex,
% loaded just above): under pdfTeX load fontenc/inputenc/textcomp;
% otherwise load unicode-math and set default font features.
% lmodern is loaded for every engine. The second conditional below has an
% empty non-pdfTeX branch in this file and therefore does nothing.
\ifPDFTeX
  \usepackage[T1]{fontenc}
  \usepackage[utf8]{inputenc}
  \usepackage{textcomp} % provide euro and other symbols
\else % if luatex or xetex
  \usepackage{unicode-math} % this also loads fontspec
  \defaultfontfeatures{Scale=MatchLowercase}
  \defaultfontfeatures[\rmfamily]{Ligatures=TeX,Scale=1}
\fi
\usepackage{lmodern}
\ifPDFTeX\else
  % xetex/luatex font selection
\fi
% Use upquote if available, for straight quotes in verbatim environments
\IfFileExists{upquote.sty}{\usepackage{upquote}}{}
\IfFileExists{microtype.sty}{% use microtype if available
  \usepackage[]{microtype}
  \UseMicrotypeSet[protrusion]{basicmath} % disable protrusion for tt fonts
}{}
% Paragraph spacing. When \KOMAClassName is undefined (non-KOMA class, as
% with the article class used here): load parskip if the file exists,
% otherwise fall back to zero \parindent and a stretchable 6pt \parskip.
% When it is defined: request parskip=half through \KOMAoptions.
\makeatletter
\@ifundefined{KOMAClassName}{% if non-KOMA class
  \IfFileExists{parskip.sty}{%
    \usepackage{parskip}
  }{% else
    \setlength{\parindent}{0pt}
    \setlength{\parskip}{6pt plus 2pt minus 1pt}}
}{% if KOMA class
  \KOMAoptions{parskip=half}}
\makeatother
\usepackage{xcolor}
\usepackage[margin=1in]{geometry}
\usepackage{color}
\usepackage{fancyvrb}
% Verbatim helpers built on fancyvrb:
%   \VerbBar      expands to a literal vertical bar.
%   \VERB         is \Verb with backslash and braces kept as command characters.
%   Highlighting  is a Verbatim environment with the same command characters,
%                 so macros (presumably the *Tok commands defined further
%                 down) can be used inside it.
\newcommand{\VerbBar}{|}
\newcommand{\VERB}{\Verb[commandchars=\\\{\}]}
\DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}
% Add ',fontsize=\small' for more characters per line
\usepackage{framed}
% Code-block styling.
%   shadecolor  light grey (248,248,248); presumably the background that
%               snugshade picks up -- confirm against the framed package.
%   Shaded      thin wrapper environment around snugshade.
%   \...Tok     one-argument macros, one per syntax-highlighting token class.
%               Each wraps its argument in a fixed rgb colour and, for some
%               classes, bold and/or italic. BuiltInTok, ExtensionTok,
%               ImportTok, NormalTok and RegionMarkerTok pass their argument
%               through unchanged.
\definecolor{shadecolor}{RGB}{248,248,248}
\newenvironment{Shaded}{\begin{snugshade}}{\end{snugshade}}
\newcommand{\AlertTok}[1]{\textcolor[rgb]{0.94,0.16,0.16}{#1}}
\newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\BuiltInTok}[1]{#1}
\newcommand{\CharTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\CommentTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{#1}}
\newcommand{\DecValTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\ErrorTok}[1]{\textcolor[rgb]{0.64,0.00,0.00}{\textbf{#1}}}
\newcommand{\ExtensionTok}[1]{#1}
\newcommand{\FloatTok}[1]{\textcolor[rgb]{0.00,0.00,0.81}{#1}}
\newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\ImportTok}[1]{#1}
\newcommand{\InformationTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.13,0.29,0.53}{\textbf{#1}}}
\newcommand{\NormalTok}[1]{#1}
\newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\OtherTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{#1}}
\newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textit{#1}}}
\newcommand{\RegionMarkerTok}[1]{#1}
\newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.81,0.36,0.00}{\textbf{#1}}}
\newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\StringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\VariableTok}[1]{\textcolor[rgb]{0.00,0.00,0.00}{#1}}
\newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.31,0.60,0.02}{#1}}
\newcommand{\WarningTok}[1]{\textcolor[rgb]{0.56,0.35,0.01}{\textbf{\textit{#1}}}}
\usepackage{graphicx}
% Size caps for included graphics.
%   \maxwidth   expands to \linewidth when \Gin@nat@width exceeds it,
%               otherwise to \Gin@nat@width itself.
%   \maxheight  does the same with \textheight and \Gin@nat@height.
% \Gin@nat@width / \Gin@nat@height are internal names (hence \makeatletter);
% presumably the natural size of the image being included -- confirm against
% the graphicx sources.
\makeatletter
\def\maxwidth{\ifdim\Gin@nat@width>\linewidth\linewidth\else\Gin@nat@width\fi}
\def\maxheight{\ifdim\Gin@nat@height>\textheight\textheight\else\Gin@nat@height\fi}
\makeatother
% Scale images if necessary, so that they will not overflow the page
% margins by default, and it is still possible to overwrite the defaults
% using explicit options in \includegraphics[width, height, ...]{}
\setkeys{Gin}{width=\maxwidth,height=\maxheight,keepaspectratio}
% Set default figure placement to htbp
\makeatletter
\def\fps@figure{htbp}
\makeatother
% Underlining support, chosen by engine: luacolor plus lua-ul (with its
% soul option) under LuaTeX, plain soul otherwise. The section titles in
% the body call \ul, which is presumably supplied by whichever branch is
% taken here.
\ifLuaTeX
  \usepackage{luacolor}
  \usepackage[soul]{lua-ul}
\else
  \usepackage{soul}
\fi
\setlength{\emergencystretch}{3em} % prevent overfull lines
% \tightlist: sets \itemsep and \parskip to 0pt. Defined with
% \providecommand, so an existing definition is left untouched.
\providecommand{\tightlist}{%
  \setlength{\itemsep}{0pt}\setlength{\parskip}{0pt}}
\setcounter{secnumdepth}{-\maxdimen} % remove section numbering
\usepackage{amsmath}
\usepackage{sectsty} \allsectionsfont{\centering}
\usepackage{booktabs}
\usepackage{longtable}
\usepackage{array}
\usepackage{multirow}
\usepackage{wrapfig}
\usepackage{float}
\usepackage{colortbl}
\usepackage{pdflscape}
\usepackage{tabu}
\usepackage{threeparttable}
\usepackage{threeparttablex}
\usepackage[normalem]{ulem}
\usepackage{makecell}
\usepackage{xcolor}
\ifLuaTeX
  \usepackage{selnolig}  % disable illegal ligatures
\fi
\usepackage{bookmark}
\IfFileExists{xurl.sty}{\usepackage{xurl}}{} % add URL line breaks if available
\urlstyle{same}
% PDF metadata and link appearance: author and creator strings, coloured
% links instead of boxes, and one colour per link kind.
% NOTE(review): there is no explicit \usepackage{hyperref} in this preamble;
% presumably it is pulled in by bookmark -- confirm. The colour names
% (Maroon, Blue) presumably come from the xcolor name sets requested via
% \PassOptionsToPackage at the top of the file.
\hypersetup{
  pdfauthor={Nathan A. Nguyen},
  colorlinks=true,
  linkcolor={Maroon},
  filecolor={Maroon},
  citecolor={Blue},
  urlcolor={blue},
  pdfcreator={LaTeX via pandoc}}

\title{\vspace{5cm}

GIS 563

Local Statistical Modeling

Final Project}
\author{Nathan A. Nguyen}
\date{10 December 2024}

\begin{document}
\maketitle

\newpage

\section{\texorpdfstring{\ul{Introduction}}{Introduction}}\label{introduction}

This write-up serves as the second part submission for the final project
in the class.

For this project, an empirical application of MGWR was performed and
compared with a global OLS model. The response variable was the
estimated median household income in 2021, and the spatial units were
United States counties. Details about the dataset and any preprocessing
that occurred will be discussed. The methods section will cover the
models explored and the software implementation as well as what R
packages were used for mapping for those who are interested. Finally
this write-up will close with brief discussions of the results and any
room for improvements should this be a real academic project.

The model presented in this write-up deviates slightly from the
model presented in part 1. The modifications were made to accommodate
critiques received while presenting -- namely the use of poverty level as a
predictor. This variable was replaced with the percentage of each
county's population that is considered to live in an urban area.
During the initial presentation, the Monte Carlo test was still running,
so no results were available.

Due to time constraints, and unexpected events, a Monte Carlo test for
the existence of spatial variability was not performed for the model
presented in this write-up. The Monte Carlo test for the first version
of this model did complete eventually, and it suggested that the only
variable with evidence for spatial variability was the all age poverty
levels in 2021. I included this variable initially because although
poverty is obviously associated with income, I wanted to see whether or
not the effects of poverty on median income were uniform across space or
if they changed based on geography.

That being said, a rigorous test for spatial variance will not be
provided for this second model presented.

\newpage

\section{\texorpdfstring{\ul{Data
Details}}{Data Details}}\label{data-details}

The dataset used for analysis is an amalgamation of various datasets
from the United States Census Bureau/United States Department of
Commerce, the American Community Survey, the United States Department of
Agriculture's Economic Research Service, and from a 2022 paper by
Fotheringham et~al.\textsuperscript{1}

The units of study were United States counties, and only mainland
counties were intended to be retained in the dataset. For transparency,
most of Connecticut is missing, and this issue was not observed until
after the fact. The missingness is attributed to the non-standardization
of FIPS codes among the various datasets used. Some locations in
Connecticut are not considered true counties and are ``county
equivalents'', which was not known a~priori.

After preprocessing, the final dataset consisted of 3,100 locations.
Some R functions were defined in order to assist the preprocessing step
-- namely cleaning of FIPS codes and joining all of the datasets
together.

The chosen response variable was the estimated median household income
in the year 2021 and was provided by the Census Bureau/Department of
Commerce\textsuperscript{6}. Nine predictors were included in the
models:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Gini Index (1-year estimate; 2021)\textsuperscript{5}
\item
  Population Density (natural logged)\textsuperscript{1}
\item
  Percent of Households with Internet Access (5-year estimate;
  2017-2022)\textsuperscript{2}
\item
  Percent of Population with Bachelors Degree or
  Higher\textsuperscript{3}
\item
  Percent of Population Living in an Urban Area (1-year estimate;
  2020)\textsuperscript{6}
\item
  Sex Ratio (Male-to-Female) (5-year estimate;
  2017-2022)\textsuperscript{4}
\item
  Median Age (5-year estimate; 2017-2022)\textsuperscript{4}
\item
  Percent Population that is Black (5-year estimate;
  2017-2022)\textsuperscript{4}
\item
  Percent Population that is Hispanic or Latino (5-year estimate;
  2017-2022)\textsuperscript{4}
\end{enumerate}

\begin{longtable}[t]{lcccc}
\caption{\label{tab:unnamed-chunk-2}Summary Statistics for Selected Variables}\\
\toprule
Variable & Min & Mean & Median & Max\\
\midrule
Median Income (21) & 25653.00 & 58741.99 & 56465.50 & 153716.00\\
Gini Index (17–21) & 0.25 & 0.45 & 0.44 & 0.73\\
Population Density (Natural Log) & -1.93 & 3.78 & 3.78 & 10.77\\
\% Internet Access (21) & 35.97 & 82.78 & 83.89 & 100.00\\
\% with Bachelor's Degree or Higher (18–22) & 0.00 & 23.44 & 20.90 & 78.90\\
\addlinespace
\% Population in Urban Area (20) & 0.00 & 35.95 & 33.41 & 100.00\\
Sex Ratio (Male:Female, 17–21) & 76.90 & 101.93 & 99.60 & 221.30\\
Median Age (17–21) & 22.40 & 41.52 & 41.30 & 68.10\\
\% Black (17–21) & 0.00 & 9.03 & 2.26 & 87.12\\
\% Hispanic or Latino (17–21) & 0.00 & 9.82 & 4.49 & 98.22\\
\bottomrule
\end{longtable}

\newpage

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/median-income21} 

}

\caption{Estimated Median Household Income (2021)}\label{fig:unnamed-chunk-3}
\end{figure}

Figure 1 shows the estimated median household income in 2021. By
inspection, there appears to be some clustering of this random variable.
For example, we see that the median household income is generally higher
in the north-east coast of the United States, indicated by the darker
coloring, when compared to the deep south and Appalachia, indicated by
the lighter shading. The west coast, generally, has higher income as
well.

This is likely due to the fact that the north-east and west-coast are
more developed regions of the country with high paying industries like
finance and technology while the regions with lower income are more
rural and have declining industries e.g., manufacturing and mining.

\newpage

Furthermore, the regions with higher median household income generally
have higher educational attainment as well. Figure 2 is evidence of
this:

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/bach-higher-18-22} 

}

\caption{Percent of Population Having Bachelors Degree or Higher}\label{fig:unnamed-chunk-4}
\end{figure}

\newpage

\section{\texorpdfstring{\ul{Methods}}{Methods}}\label{methods}

A global OLS model and an MGWR model were fit on the dataset and compared
to one another. The variance explained by both models was compared, as
well as the AICc. All variables were standardized to have mean zero and
variance one, and standardization was performed in R. All features in
the model are of order one, and no interaction terms were considered.

A linear model was fit in R using the \texttt{lm()} function while the
MGWR model was fit using the MGWR 2.2 GUI software. I also calibrated an
MGWR with the \texttt{mgwr} python module, but I ran into issues while
extracting some data.

Ad-hoc tests were used in place of a Monte Carlo test:

\[
\begin{aligned}
\mathrm{IQR}_{k} &> 2\times \mathrm{SE}_{k,\mathrm{global}}
\end{aligned}
\]

Corrected \(\alpha\)-values were computed following:

\[
\begin{aligned}
\alpha_{j} &= \frac{\alpha^{*}}{\mathrm{ENP}_{j}}
\end{aligned}
\]

where \(\alpha^{*} = 0.05\) and the \(\mathrm{ENP}_{j}\) were obtained from the
\texttt{txt} file from the MGWR 2.2 session.

\newpage

\subsection{\texorpdfstring{\ul{Global OLS
Model}}{Global OLS Model}}\label{global-ols-model}

\begin{table}[H]
\renewcommand{\arraystretch}{1.5} % Adjust row spacing
\centering
\caption{Global OLS Results}
\label{tab:ols_results}
\begin{tabular}{lcccc}
\hline
\textbf{Variable} & \textbf{Estimate} & \textbf{Std. Error} & \textbf{t-value} & \textbf{p-value} \\ \hline
Intercept             & 5.87e-17       & 1.00e-02         & 0.000             & 1.000 \\ 
Gini Index (17–21)    & -0.259         & 0.011            & -22.595           & $<$2e-16 *** \\ 
Population Density (Log) & 0.179         & 0.016            & 11.468           & $<$2e-16 *** \\ 
\% Internet Access (21)  & 0.237         & 0.015            & 15.627           & $<$2e-16 *** \\ 
\% Bachelor's Degree or Higher (18–22) & 0.579 & 0.014   & 41.660           & $<$2e-16 *** \\ 
\% Population in Urban Area (20) & -0.124 & 0.018        & -6.994           & 3.26e-12 *** \\ 
Sex Ratio (Male:Female, 17–21) & 0.035   & 0.011          & 3.295            & 0.000997 *** \\ 
Median Age (17–21)     & 0.010         & 0.012            & 0.845            & 0.398 \\ 
\% Black (17–21)       & -0.059        & 0.012            & -4.929           & 8.70e-07 *** \\ 
\% Hispanic or Latino (17–21) & 0.080    & 0.012          & 6.975            & 3.73e-12 *** \\ 
\hline
\multicolumn{5}{l}{\textit{Residual Standard Error:} 0.5578 on 3090 degrees of freedom} \\
\multicolumn{5}{l}{\textit{Multiple R-squared:} 0.6897, \textit{Adjusted R-squared:} 0.6888} \\
\multicolumn{5}{l}{\textit{F-statistic:} 763.3 on 9 and 3090 DF, \textit{p-value:} $<$2.2e-16} \\
\hline
\multicolumn{5}{l}{\textbf{Signif. Codes:} *** $p < 0.001$, ** $p < 0.01$, * $p < 0.05$} \\
\end{tabular}
\end{table}

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-6-1} 

}

\caption{Global OLS Added Variable Plots}\label{fig:unnamed-chunk-6}
\end{figure}

A summary of the global OLS model results is contained in
Table~\ref{tab:ols_results}. The global model is able to explain
approximately \(69\%\) of the variance of standardized median income
(adjusted \(R^{2}\)). All predictors are statistically significant at
\(\alpha = 0.05\) except for median age. A side comment is that with
sufficiently large sample sizes, even trivially small effects will be
statistically significant. Effect sizes might be a better measure of a
predictor's importance in the future.

Educational attainment had the largest positive effect on median income
as seen in Table 2 as well as the added variable plots (Figure 3). A one
standard deviation increase in the percentage of the population having a
bachelors degree or higher is associated with a 0.579 standard deviation
increase in the median household income with all other variables held
constant. This result is unsurprising and is well supported in modern
socioeconomic theories; however, it underscores the link between
educational attainment and earning potential. Higher paying industries
like technology, engineering, and so on oftentimes require at least a
bachelors degree to be considered ``qualified'' for a role.

The Gini Index, a measure of income inequality, had the largest negative
effect on median income. A one standard deviation increase in the index
is associated with a 0.259 standard deviation decrease in the median
income with all other variables held constant. This can also be observed
in Figure 3.

The percent with internet access and population density variables both
have positive effects on the median income. This might reflect that
counties that have more developed infrastructure and are more densely
populated have higher median incomes, which is a logical conclusion. If
there's a large and densely populated area, then there's an incentive
to invest in infrastructure. Interestingly, though, the percent of
the population living in an urban area has a negative effect on the
median income.

The sex ratio has a positive but small effect on median income. A one
standard deviation increase in the male-to-female ratio is associated
with an increase in median income of about 0.035 standard deviations.
This could be explained by the so-called gender pay gap, but could also
be largely attributed to what industry someone works in. After all, this
dataset is very aggregated.

A one standard deviation increase in the percent of the population that
is Black is associated with a 0.059 standard deviation decrease in median
income, while the same increase in the Hispanic or Latino population is
associated with a 0.08 standard deviation increase in the median income.
In both cases, the effects seem marginal, and the dominant reference
group is the White population.

Although the global model has strong explanatory power
(\(R_{\mathrm{adj}}^{2} \approx 69\%\)), it is not without limitations. The
diagnostic plots (Figure 4) indicate potential violations of the
assumptions of linear regression -- namely heteroskedasticity
(non-constant variance) and non-normally distributed residuals.
Figure 4 is evidence of heteroskedasticity as there is a
cone structure in the residuals: the residuals get larger for larger
predicted values of \(Y\). Furthermore, it can be observed that the
distribution of residuals has a fatter right tail and a thinner left
tail. If the residuals were distributed normally, then the standardized
residuals would be more symmetric and would hug the theoretical line
more tightly.

Figure 5 also suggests that higher-order terms, or at least some
transformation on the raw variables, might be warranted. The
component-residual plot for the percent with internet access variable
shows curvature in the data. This indicates a non-linear specification
of this variable might be the proper functional form. All other graphs
are relatively linear.

\begin{figure}[H]

{\centering \includegraphics{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-7-1} 

}

\caption{Global OLS Diagnostics}\label{fig:unnamed-chunk-7}
\end{figure}

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-8-1} 

}

\caption{Global OLS Component Regression Plots}\label{fig:unnamed-chunk-8}
\end{figure}

\newpage

On the issue of heteroskedasticity, it is possible that spatial
autocorrelation might be a contributing factor. It's reasonable to
suspect that nearby locations are more similar to one another. In this
case, counties with higher median income might be clustered together. To
assess this, a Moran's I test was implemented on the OLS residuals.
Queen's contiguity was used to define neighbors, and row-standardized
weights were chosen to give all neighbors equal weights. A zero policy
was enabled due to some missing counties -- namely in Connecticut.

The results of the Moran's test are:

\begin{table}[H]
\renewcommand{\arraystretch}{1.3} % Adjust row spacing
\setlength{\tabcolsep}{12pt} % Adjust column spacing for wider table
\centering
\caption{Global OLS Residual Moran's I Test Results}
\label{tab:global_ols_morans_i}
\makebox[\textwidth]{ % Makes the table span the entire text width
\begin{tabular}{lcc}
\hline
\textbf{Statistic} & \textbf{Value} \\ \hline
Moran's I Statistic       & 0.3087 \\ 
Expectation               & -0.0003 \\ 
Variance                  & 0.0001 \\ 
Standard Deviate          & 28.675 \\ 
\textit{p-value}          & $<$ 2.2e-16 \\ 
\hline
\textbf{Alternative Hypothesis} & Greater \\ 
\multicolumn{2}{l}{\textit{Notes:} Moran's I test under randomization.} \\
\hline
\end{tabular}
}
\end{table}

The test is significant at the level of 0.05, and so the null hypothesis
is rejected. There is sufficient evidence to suggest that positive
spatial autocorrelation exists.

Another way to look at this is to plot the response variable against
its spatial lag (and similarly for the OLS residuals), as seen in
Figures 6--7:

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-10-1} 

}

\caption{Moran Plot for Median Income}\label{fig:unnamed-chunk-10}
\end{figure}

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-11-1} 

}

\caption{Moran Plot for Global OLS Residuals}\label{fig:unnamed-chunk-11}
\end{figure}

If spatial autocorrelation did not exist, the slope of the diagonal line
would be approximately zero. In this case, there is clearly a positive
slope, and in fact, for the residual plot (Figure 7), the slope of the
line is the value of Moran's I in Table 3.

The implication is that maybe a global OLS model is insufficient in
explaining the data generating process and that a local modeling
approach might better capture the underlying processes. This is where
MGWR comes into play.

\newpage

\subsection{\texorpdfstring{\ul{MGWR
Model}}{MGWR Model}}\label{mgwr-model}

\begin{table}[H]
\renewcommand{\arraystretch}{1.3}
\centering
\caption{MGWR Model Summary}
\label{tab:mgwr_summary}
\begin{tabular}{lccccc}
\hline
\textbf{Variable} & \textbf{Min} & \textbf{Mean} & \textbf{Median} & \textbf{Max} & \textbf{Bandwidth (95\% CI)} \\ \hline
Intercept                & -0.584 & 0.007 & -0.018 & 0.949 & 44 [44, 44] \\ 
Gini Index (17–21)       & -0.568 & -0.206 & -0.193 & 0.055 & 92 [82, 107] \\ 
Population Density (Log) & -0.172 & 0.040 & 0.063 & 0.196 & 588 [488, 764] \\ 
\% Internet Access (21)  & -0.390 & 0.269 & 0.230 & 1.143 & 44 [44, 46] \\ 
\% Bachelor's Degree or Higher (18–22) & -0.137 & 0.471 & 0.477 & 0.971 & 52 [48, 57] \\ 
\% Population in Urban Area (20) & -0.071 & -0.049 & -0.050 & -0.028 & 2263 [1932, 2654] \\ 
Sex Ratio (Male:Female, 17–21) & -0.004 & 0.032 & 0.029 & 0.136 & 626 [488, 764] \\ 
Median Age (17–21)       & -0.297 & 0.041 & 0.060 & 0.258 & 142 [132, 172] \\ 
\% Black (2017–2021)     & -0.228 & -0.226 & -0.226 & -0.225 & 3098 [2378, 3098] \\ 
\% Hispanic or Latino (17–21) & -0.195 & 0.044 & 0.067 & 0.252 & 473 [423, 594] \\ \hline
\textbf{Metric} & \multicolumn{5}{l}{} \\ \hline
Residual Sum of Squares  & \multicolumn{5}{r}{328.055} \\
Log-Likelihood           & \multicolumn{5}{r}{-917.446} \\
AIC                      & \multicolumn{5}{r}{3023.785} \\
AICc                     & \multicolumn{5}{r}{3306.439} \\
BIC                      & \multicolumn{5}{r}{6613.743} \\
R\textsuperscript{2}     & \multicolumn{5}{r}{0.894} \\
Adjusted R\textsuperscript{2} & \multicolumn{5}{r}{0.869} \\
Degree of Dependency (DoD) & \multicolumn{5}{r}{0.492} \\ \hline
\end{tabular}
\end{table}
\begin{table}[H]
\renewcommand{\arraystretch}{1.3} % Adjust row spacing
\setlength{\tabcolsep}{12pt} % Adjust column spacing for wider table
\centering
\caption{IQR (ad-hoc) Results}
\label{tab:iqr_results}
\begin{tabular}{lcccc}
\hline
\textbf{Variable} & \textbf{IQR} & \textbf{SE (Global)} & \textbf{Threshold} & \textbf{Significant} \\ \hline
Intercept                & 0.3380 & 0.010 & 0.020 & True \\ 
Gini Index (17–21)       & 0.1420 & 0.011 & 0.022 & True \\ 
Population Density (Log) & 0.1780 & 0.016 & 0.032 & True \\ 
Median Age (17–21)       & 0.1290 & 0.012 & 0.024 & True \\ 
\% Bachelor's Degree or Higher (18–22) & 0.2470 & 0.014 & 0.028 & True \\ 
\% Black (2017–2021)     & 0.0017 & 0.012 & 0.024 & False \\ 
\% Hispanic or Latino (17–21) & 0.1670 & 0.012 & 0.024 & True \\ 
\% Internet Access (21)  & 0.2650 & 0.015 & 0.030 & True \\ 
\% Population in Urban Area (20) & 0.0256 & 0.018 & 0.036 & False \\ 
Sex Ratio (Male:Female, 17–21) & 0.0312 & 0.011 & 0.022 & True \\ 
\hline
\end{tabular}
\end{table}

\newpage

Table 4 provides a summary of the MGWR calibration. This local model is
able to explain about \(87\%\) of the variance in the data (adjusted
\(R^{2} = 0.869\)). The IQR ad-hoc procedure (Table 5) was performed
instead of the recommended Monte Carlo test due to time constraints. All
variables except for the percent of the population that is Black and the
percent of the population living in an urban area showed evidence for
spatial variability under this method.

The intercept (location if all other variables were homogeneous and
zero), Gini Index, percent with internet access, and percent with a
bachelors degree or higher all have relatively small
bandwidths and narrow bandwidth confidence intervals when compared to
the overall number of locations, \(N = 3{,}100\). This indicates that the
effects of these variables are very local, i.e., the effects of these
variables are not uniform across space, which a global OLS incorrectly
assumes. Refer to Figures 8, 9, 11, and 12. When we plot these
variables' significant local parameter estimates, we expect to see
clusters and a lot of variation in the parameter surface, whereas more
regional and uniform effects, indicated by larger bandwidths, would have
a more uniform/smooth coloring on the entire parameter surface.

Population density, sex ratio, median age, and the percent of the
population that is Hispanic or Latino have larger bandwidths than the
prior variables discussed, but they are not large enough to say they
have global effects. We argue that these variables
have more of a regional effect. For these regional effects, the coloring
of the parameter surface will be smoother when compared to the very
local effects, but not completely uniform, which would be the case for
global effects (large bandwidths). Refer to Figures 10, 13, 14, and 15.

The percentage of the population living in an urban area and the
percent of the population that is Black are considered to have global
effects. Their bandwidth point estimates each cover over \(70\%\) of
the overall number of locations. The parameter surfaces for these
variables are uniform, and so no clustering will be observed. These two
variables are non-significant according to the ad-hoc procedure.

The following corrected \(\alpha\)-values were used for plotting
significant local parameter estimates:

\begin{table}[H]
\renewcommand{\arraystretch}{1.3} % Adjust row spacing
\setlength{\tabcolsep}{12pt} % Adjust column spacing for a wider table
\centering
\caption{Corrected Alpha Levels for MGWR Variables}
\label{tab:variable_enps}
\begin{tabular}{lccc}
\hline
\textbf{Variable} & \textbf{ENP\textsubscript{j}} & \textbf{Alpha Corrected} \\ \hline
Intercept                 & 164.854 & 0.000303 \\ 
Gini Index (17–21)        & 76.129  & 0.000657 \\ 
Population Density (Log)  & 6.687   & 0.007477 \\ 
\% Internet Access (21)   & 149.197 & 0.000335 \\ 
\% Bachelor's Degree or Higher (18–22) & 125.310 & 0.000399 \\ 
\% Population in Urban Area (20) & 1.920 & 0.026042 \\ 
Sex Ratio (Male:Female, 17–21) & 11.088 & 0.004509 \\ 
Median Age (17–21)        & 48.350  & 0.001034 \\ 
\% Black (2017–2021)      & 1.027   & 0.048685 \\ 
\% Hispanic or Latino (17–21) & 8.884 & 0.005628 \\ 
\hline
\end{tabular}
\end{table}

\newpage

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-intercept} 

}

\caption{Significant Local Intercept Estimates}\label{fig:unnamed-chunk-12}
\end{figure}

If all other variables were zero and homogeneous, then location alone
would still play a role in median income. Counties in south Texas and in
Virginia have higher median incomes when compared to some counties in
Kansas, Missouri, and Arkansas, and this is purely location dependent.

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-gini} 

}

\caption{Significant Local Gini Index Estimates}\label{fig:unnamed-chunk-13}
\end{figure}

The effects of income inequality (Gini Index) are local, as seen in its
small bandwidth and the variation in the parameter surface. Although an
increase in income inequality is associated with a decrease in median
income all around, the effects are not the same everywhere in the United
States. For some reason, the effect of an increase in inequality is more
severe in the Great Lakes region, in the eastern Virginian counties,
Maryland, and Delaware when compared to southern California, some parts
of Arizona, and Texas.

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-ln-pop-den} 

}

\caption{Significant Local Population Density Estimates}\label{fig:unnamed-chunk-14}
\end{figure}

Population density appears to be more of a regional effect given its
larger bandwidth. This observation can also be seen in the coloring of
the parameter surface. Although there are obvious variations in the
surface, they're much smoother/gradual when compared to the surface for
Gini or education attainment for example.

What is interesting is that west of Illinois, an increase in population
density appears to be associated with an increase in median income, while
an increase in population density appears to be associated with a
decrease in median income east of Ohio -- specifically near the US
capital and the entire northeast of the country. The effects appear to
be marginal though.

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-internet-access} 

}

\caption{Significant Local Percent with Internet Access Estimates}\label{fig:unnamed-chunk-15}
\end{figure}

The effects of internet access are apparently local as well given its
small bandwidth. For some reason internet access has the strongest
positive effects in Colorado. Nonetheless, having internet access is
associated with an increase in median income all around, although many
of the local estimates are non-significant.

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-pct-bach-higher} 

}

\caption{Significant Local Bachelors Degree or Higher Estimates}\label{fig:unnamed-chunk-16}
\end{figure}

The effects of having a bachelors degree or higher are very local given
its small bandwidth and the clustering observed in the map. Educational
attainment has a much stronger positive effect on median income in
New Jersey, some parts of Vermont, eastern Pennsylvania, and northern
Ohio when compared to Arizona, Montana, and Idaho. The darker structure
in the Appalachia region is also interesting to observe. The
parameter surface for having a bachelors degree or higher is a great
example of local effects.

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-sex-ratio} 

}

\caption{Significant Local Sex Ratio Estimates}\label{fig:unnamed-chunk-17}
\end{figure}

The effects of having more males than females (sex ratio) are regional,
although they are only really noticeable in North/South Dakota,
Minnesota, Texas, Louisiana, southern New Mexico, and southern Arizona.

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-median-age} 

}

\caption{Significant Local Median Age Estimates}\label{fig:unnamed-chunk-18}
\end{figure}

Median age is only significant in a small percentage of counties.
An increase in the median age is associated with a decrease in median
income near El Paso, Texas as well as in Maine, but it is associated
with an increase in median income near Washington DC.

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/local-param/sig-pct-hisplat} 

}

\caption{Significant Local Percent Population Hispanic or Latino Estimates}\label{fig:unnamed-chunk-19}
\end{figure}

The effects of an increase in the Hispanic or Latino population are
regional, which makes sense. That being said, an increase in Hispanic or
Latino population is associated with a decrease in median income in the
south-west of the country, but these effects appear to be marginal. An
interesting observation is that an increase in the Hispanic or Latino
population has a strong positive effect in the rust-belt region of the
country.

\newpage

\subsection{\texorpdfstring{\ul{Comparing Global OLS
vs.~MGWR}}{Comparing Global OLS vs.~MGWR}}\label{comparing-global-ols-vs.-mgwr}

\begin{table}[H]
\renewcommand{\arraystretch}{1.3}
\centering
\caption{Comparison: Global OLS vs. MGWR}
\label{tab:ols_vs_mgwr}
\begin{tabular}{lcccc}
\hline
\textbf{Variable} & \textbf{OLS Estimate} & \textbf{MGWR Mean} & \textbf{MGWR Median} \\ \hline
Intercept                & 0.000 & 0.007 & -0.018 \\ 
Gini Index (17–21)       & -0.259 & -0.206 & -0.193 \\ 
Population Density (Log) & 0.179 & 0.040 & 0.063 \\ 
\% Internet Access (21)  & 0.237 & 0.269 & 0.230 \\ 
\% Bachelor's Degree or Higher (18–22) & 0.578 & 0.471 & 0.477 \\ 
\% Population in Urban Area (20) & -0.124 & -0.049 & -0.050 \\ 
Sex Ratio (Male:Female, 17–21) & 0.035 & 0.032 & 0.029 \\ 
Median Age (17–21)       & 0.010 & 0.041 & 0.060 \\ 
\% Black (2017–2021)     & -0.060 & -0.226 & -0.226 \\ 
\% Hispanic or Latino (17–21) & 0.080 & 0.044 & 0.067 \\ \hline
\textbf{Metric} & \textbf{Global OLS} & \textbf{MGWR} \\ \hline
Residual Sum of Squares  & 961.468 & 328.055 \\
Log-Likelihood           & -2584.131 & -917.446 \\
AIC                      & 5188.262 & 3023.785 \\
AICc                     & 5190.347 & 3306.439 \\
BIC                      & N/A & 6613.743 \\
R\textsuperscript{2}     & 0.690 & 0.894 \\
Adjusted R\textsuperscript{2} & 0.689 & 0.869 \\
Degree of Dependency (DoD) & N/A & 0.492 \\ \hline
\end{tabular}
\end{table}

Table 7 is a summary comparing the global OLS and MGWR models. The signs
of the parameter estimates are consistent between the global OLS and
MGWR frameworks, indicating that the global model agrees with MGWR on
the direction of the effects that the predictor variables have.

MGWR has superior explanatory power when compared to global OLS
(\(R_{\text{adj, MGWR}}^{2} \approx 87\% > R_{\text{adj, OLS}}^{2} \approx 69\%\)).
Despite having far more parameters than OLS, MGWR's AICc is smaller than
OLS' AICc (\(\text{AICc}_{\text{MGWR}} \approx 3306 < \text{AICc}_{\text{OLS}} \approx 5190\)). So
while being a more complicated model, it does a better job at explaining
the data when compared to OLS, and so the trade-off is worthwhile.

\newpage

The existence of spatial autocorrelation was again explored, but now
with the MGWR generated residuals, and the same procedure for the
Moran's test was implemented on the MGWR residuals. The test failed to
reject the null hypothesis, and so there is insufficient evidence to
suggest any sort of spatial autocorrelation in the MGWR residuals. Refer
to Table 8 for details. Furthermore, Figure 16 is a mapping of the OLS
and MGWR residuals. Although subtle, the clustering of the residuals has
been mitigated with MGWR.

\begin{table}[H]
\renewcommand{\arraystretch}{1.3} % Adjust row spacing
\setlength{\tabcolsep}{12pt} % Adjust column spacing for wider table
\centering
\caption{MGWR Residual Moran's I Test Results}
\label{tab:mgwr_morans_i}
\makebox[\textwidth]{ % Makes the table span the entire text width
\begin{tabular}{lcc}
\hline
\textbf{Statistic} & \textbf{Value} \\ \hline
Moran's I statistic       & 0.0023 \\ 
Expectation               & -0.0003 \\ 
Variance                  & 0.0001 \\ 
Standard Deviate          & 0.2432 \\ 
\textit{p-value}          & 0.4039 \\ 
\hline
\textbf{Alternative Hypothesis} & Greater \\ 
\multicolumn{2}{l}{\textit{Notes:} Moran's I test under randomization.} \\
\hline
\end{tabular}
}
\end{table}

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/ols-mgwr-residual-combined} 

}

\caption{Global OLS Residuals vs. MGWR Residuals}\label{fig:unnamed-chunk-20}
\end{figure}

\newpage

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{final-project-write-up-nathan-nguyen_files/figure-latex/unnamed-chunk-21-1} 

}

\caption{Moran Plot for MGWR Residuals}\label{fig:unnamed-chunk-21}
\end{figure}

A Moran plot was also generated for the MGWR residuals (Figure 17). The
slope of the line is now approximately zero, indicating no spatial
autocorrelation.

\newpage

\begin{figure}[H]

{\centering \includegraphics[width=1\linewidth]{images/nonlinearity/local-betas-vs-x} 

}

\caption{MGWR Diagnostic Plots}\label{fig:unnamed-chunk-22}
\end{figure}

Figure 18 shows various plots of the local \(\hat{\beta}\)s against the
standardized values for each predictor. Upon inspection, first-order
specifications for population density, percent with internet access, and
percent of population that are Hispanic or Latino may not be
appropriate. Polynomial terms, a transformation of the raw variables
before standardization, or interaction terms might mitigate non-linearity
issues, but that is for future work.

\newpage

\section{\texorpdfstring{\ul{Conclusions/Improvements}}{Conclusions/Improvements}}\label{conclusionsimprovements}

If I were to write a real paper on something like this, I'd change my
response variable to something that better reflects discretionary income,
e.g., the median household income after adjusting for housing costs.
While people in California, New York, and Washington might make a lot
more than say, someone living in Alabama, people living in California,
New York, or Washington likely have a much larger housing cost when
compared to someone living in Alabama.

A transformation of some of the predictor variables might be warranted
given some non-linearity observed in both OLS and MGWR diagnostic plots.

Additionally, a thorough literature review would have been beneficial in
order to understand the unique socioeconomic profiles of the US
counties. This would have aided in better understanding the results of
MGWR and/or helped validate existing theories in the literature. A
review would have also provided a better foundation for variable
selection in the models, e.g., including employment industry variables,
and so on.

Finally, a Monte Carlo test should have been implemented to detect
spatial variability.

For those who are interested in creating the maps in R, please refer to
my GitHub repository:
\url{https://github.com/loafing-cat/gis563-local-stat-model-example}.

The following are the core R libraries for mapping:

\begin{Shaded}
\begin{Highlighting}[]
\FunctionTok{library}\NormalTok{(tidyverse)}
\FunctionTok{library}\NormalTok{(sf)}
\FunctionTok{library}\NormalTok{(tigris)}
\FunctionTok{library}\NormalTok{(colorspace)}
\end{Highlighting}
\end{Shaded}

\newpage

\section{\texorpdfstring{\ul{References}}{References}}\label{references}

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\item
  Li, Z., \& Fotheringham, A. S. (2022). The spatial and temporal
  dynamics of voter preference determinants in four U.S. presidential
  elections (2008--2020). Transactions in GIS, 26, 1609--1628.
  \url{https://doi.org/10.1111/tgis.12880}
\item
  U.S. Census Bureau. (2022). Internet Subscriptions in Household.
  American Community Survey, ACS 5-Year Estimates Detailed Tables, Table
  B28011. Retrieved December 7, 2024, from
  \url{https://data.census.gov/table/ACSDT5Y2022.B28011?q=Telephone,%20Computer,%20and%20Internet%20Access&g=010XX00US$0500000}.
\item
  United States Department of Agriculture, Economic Research Service.
  (2022). County-Level Data Sets: Poverty estimates. Retrieved from
  \url{https://www.ers.usda.gov/data-products/county-level-data-sets/county-level-data-sets-download-data/}
\item
  U.S. Census Bureau. (2021). ACS DEMOGRAPHIC AND HOUSING ESTIMATES.
  American Community Survey, ACS 5-Year Estimates Data Profiles, Table
  DP05. Retrieved December 9, 2024, from
  \url{https://data.census.gov/table/ACSDP5Y2021.DP05?q=Density&t=Populations%20and%20People&g=010XX00US$0500000}.
\item
  U.S. Census Bureau. (2021). GINI INDEX OF INCOME INEQUALITY. American
  Community Survey, ACS 5-Year Estimates Detailed Tables, Table B19083.
  Retrieved December 9, 2024, from
  \url{https://data.census.gov/table/ACSDT5Y2021.B19083?q=gini&g=010XX00US$0400000}.
\item
  United States Census Bureau. (2023). County-level 2020 Census Urban
  and Rural Information for the U.S., Puerto Rico, and Island Areas
  sorted by state and county FIPS codes {[}Data file{]}. Retrieved from
  \url{https://www.census.gov/programs-surveys/geography/guidance/geo-areas/urban-rural.html}
\end{enumerate}

\end{document}