diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 9469bcf2..99aa7378 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -15,7 +15,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.3' # Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'.
+          - '1.6' # Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'.
           - '1' # Leave this line unchanged. '1' will automatically expand to the latest stable 1.x release of Julia.
         os:
           - ubuntu-latest
diff --git a/Project.toml b/Project.toml
index dc8a78f3..e4bdfe36 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "FixedEffectModels"
 uuid = "9d5cd8c9-2029-5cab-9928-427838db53e3"
-version = "1.8.1"
+version = "1.9.0"
 
 [deps]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
@@ -15,15 +15,17 @@ StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
 StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 Vcov = "ec2bfdc2-55df-4fc9-b9ae-4958c2cf2486"
+SnoopPrecompile = "66db9d55-30c0-4569-8b51-7e840670fc0c"
 
 [compat]
-DataFrames = "0.21, 0.22, 1.0"
+DataFrames = "0.21, 0.22, 1"
 FixedEffects = "2"
-Reexport = "0.1, 0.2, 1.0"
+Reexport = "0.1, 0.2, 1"
+SnoopPrecompile = "1"
 StatsAPI = "1"
 StatsBase = "0.33"
 StatsFuns = "0.9, 1"
-StatsModels = "0.6"
+StatsModels = "0.7"
 Tables = "1"
 Vcov = "0.7"
-julia = "1.3"
+julia = "1.6"
diff --git a/README.md b/README.md
index cb245ec4..51751add 100755
--- a/README.md
+++ b/README.md
@@ -6,12 +6,10 @@ This package estimates linear models with high dimensional categorical variables
 The package is registered in the [`General`](https://github.com/JuliaRegistries/General) registry and so can be installed at the REPL with `] add FixedEffectModels`.
 
 ## Benchmarks
-The objective of the package is similar to the Stata command [`reghdfe`](https://github.com/sergiocorreia/reghdfe) and the R function [`felm`](https://cran.r-project.org/web/packages/lfe/lfe.pdf). The package tends to be much faster than these two options.
-
-![benchmark](http://www.matthieugomez.com/files/fixedeffectmodels_benchmark.png)
+The objective of the package is similar to the Stata command [`reghdfe`](https://github.com/sergiocorreia/reghdfe) and the R packages [`lfe`](https://cran.r-project.org/web/packages/lfe/lfe.pdf) and [`fixest`](https://lrberge.github.io/fixest/). The package is much faster than `reghdfe` or `lfe`. It also tends to be a bit faster than the more recent `fixest` (depending on the exact command). For complicated models, `FixedEffectModels` can also run on Nvidia GPUs for even faster performance (see below).
 
-Performances are roughly similar to the newer R function [`feols`](https://cran.r-project.org/web/packages/fixest/fixest.pdf). The main difference is that `FixedEffectModels` can also run the demeaning operation on a GPU (with `method = :gpu`).
+![benchmark](http://www.matthieugomez.com/files/fixedeffectmodels_benchmark.png)
 
 ## Syntax
@@ -99,14 +97,13 @@ You may use [RegressionTables.jl](https://github.com/jmboehm/RegressionTables.jl
 
 ## Performances
-
 ### MultiThreads
-`FixedEffectModels` is multi-threaded. Use the option `nthreads` to select the number of threads to use in the estimation (defaults to `Threads.nthreads()`). That being said, multithreading does not usually make a big difference.
+`FixedEffectModels` is multi-threaded. Use the option `nthreads` to select the number of threads to use in the estimation (defaults to `Threads.nthreads()`).
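A minimal sketch of what passing this option looks like; the data frame and variable names below are invented purely for illustration, and only the `nthreads` keyword itself is taken from the `reg` signature:

```julia
using DataFrames, FixedEffectModels

# toy data, for illustration only
N = 1_000_000
df = DataFrame(id1 = rand(1:100, N), x1 = randn(N))
df.y = 3 .* df.x1 .+ cos.(df.id1) .+ randn(N)

# cap the estimation at 4 threads instead of the default Threads.nthreads()
reg(df, @formula(y ~ x1 + fe(id1)), nthreads = 4)
```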
 
-### GPU
-The package has support for GPUs (Nvidia) (thanks to Paul Schrimpf). This can make the package an order of magnitude faster for complicated problems.
+### Nvidia GPU
+The package has support for Nvidia GPUs (thanks to Paul Schrimpf). This can make the package an order of magnitude faster for complicated problems.
 
-To use GPU, run `using CUDA` before `using FixedEffectModels`. Then, estimate a model with `method = :gpu`. For maximum speed, set the floating point precision to `Float32` with `double_precision = false`.
+If you have an Nvidia GPU, run `using CUDA` before `using FixedEffectModels`. Then, estimate a model with `method = :gpu`. For maximum speed, set the floating point precision to `Float32` with `double_precision = false`.
 
 ```julia
 using CUDA, FixedEffectModels
diff --git a/benchmark/.sublime2Terminal.jl b/benchmark/.sublime2Terminal.jl
deleted file mode 100644
index b96deafd..00000000
--- a/benchmark/.sublime2Terminal.jl
+++ /dev/null
@@ -1,10 +0,0 @@
-N = 800000 # number of observations
-M = 40000 # number of workers
-O = 5000 # number of firms
-id1 = rand(1:M, N)
-id2 = [rand(max(1, div(x, 8)-10):min(O, div(x, 8)+10)) for x in id1]
-x1 = 5 * cos.(id1) + 5 * sin.(id2) + randn(N)
-x2 = cos.(id1) + sin.(id2) + randn(N)
-y= 3 .* x1 .+ 5 .* x2 .+ cos.(id1) .+ cos.(id2).^2 .+ randn(N)
-df = DataFrame(id1 = id1, id2 = id2, x1 = x1, x2 = x2, y = y)
-@time reg(df, @formula(y ~ x1 + x2 + fe(id1) + fe(id2)))
\ No newline at end of file
diff --git a/benchmark/benchmark.csv b/benchmark/benchmark.csv
index a5da65f2..b72424db 100644
--- a/benchmark/benchmark.csv
+++ b/benchmark/benchmark.csv
@@ -1 +1 @@
-Order,Command,Julia,R,Stata 1,simple,0.601445,1.843,1.2 2,1 hd fe,1.624446 ,14.831,15.51 3,2 hd fe,3.639817,10.626,49.38 4, 1 cluster,1.462648,9.255,11.15 5, 2 cluster,7.187382,96.958,118.67
\ No newline at end of file
+Order,Command,FixedEffectModels.jl (Julia),fixest (R),lfe (R),reghdfe (Stata) 1,simple,0.35,0.317,1.843, 0.61 2,1 hd fe,0.463 ,0.704 ,14.831, 4.64 3,2 hd fe,1.00,1.297 ,10.626, 22.99 4, 1 cluster se,0.38058,0.700 ,9.255, 8.28 5, 2 clusters se,0.765,1.803,96.958, 70.44
\ No newline at end of file
diff --git a/benchmark/benchmark.jl b/benchmark/benchmark.jl
index 5877edb8..c5de3d15 100755
--- a/benchmark/benchmark.jl
+++ b/benchmark/benchmark.jl
@@ -1,5 +1,6 @@
-using DataFrames, FixedEffectModels, Random, CategoricalArrays
-
+using DataFrames, Random, CategoricalArrays
+@time using FixedEffectModels
+# 13s precompiling
 # Very simple setup
 N = 10000000
 K = 100
@@ -11,17 +12,19 @@ y= 3 .* x1 .+ 5 .* x2 .+ cos.(id1) .+ cos.(id2).^2 .+ randn(N)
 df = DataFrame(id1 = id1, id2 = id2, x1 = x1, x2 = x2, y = y)
 # first time
 @time reg(df, @formula(y ~ x1 + x2))
-# 14s
+# 3.5s
 @time reg(df, @formula(y ~ x1 + x2))
-# 0.582029 seconds (852 allocations: 535.311 MiB, 18.28% gc time)
+# 0.497374 seconds (450 allocations: 691.441 MiB, 33.18% gc time)
+@time reg(df, @formula(y ~ x1 + x2), Vcov.cluster(:id2))
+# 1.898018 seconds (7.10 M allocations: 1.220 GiB, 8.20% gc time, 4.46% compilation time)
 @time reg(df, @formula(y ~ x1 + x2), Vcov.cluster(:id2))
-# 0.621690 seconds (693 allocations: 768.945 MiB, 7.69% gc time)
+# 0.605172 seconds (591 allocations: 768.939 MiB, 42.38% gc time)
 @time reg(df, @formula(y ~ x1 + x2 + fe(id1)))
-# 1.143941 seconds (245.39 k allocations: 942.937 MiB, 12.93% gc time, 14.99% compilation time)
+# 0.893835 seconds (1.03 k allocations:
929.130 MiB, 54.19% gc time) @time reg(df, @formula(y ~ x1 + x2 + fe(id1)), Vcov.cluster(:id1)) -# 1.242207 seconds (245.73 k allocations: 1022.348 MiB, 9.48% gc time, 14.10% compilation time) +# 1.015078 seconds (1.18 k allocations: 1008.532 MiB, 56.50% gc time) @time reg(df, @formula(y ~ x1 + x2 + fe(id1) + fe(id2))) -# 2.255812 seconds (351.74 k allocations: 1.076 GiB, 3.98% gc time, 12.93% compilation time) +# 1.835464 seconds (4.02 k allocations: 1.057 GiB, 35.59% gc time) # More complicated setup N = 800000 # number of observations @@ -34,7 +37,7 @@ x2 = cos.(id1) + sin.(id2) + randn(N) y= 3 .* x1 .+ 5 .* x2 .+ cos.(id1) .+ cos.(id2).^2 .+ randn(N) df = DataFrame(id1 = id1, id2 = id2, x1 = x1, x2 = x2, y = y) @time reg(df, @formula(y ~ x1 + x2 + fe(id1) + fe(id2))) -# 3.048292 seconds (422.51 k allocations: 114.317 MiB, 6.86% compilation time) +# 2.504294 seconds (75.83 k allocations: 95.525 MiB, 0.23% gc time) +# fixest @@ -48,8 +51,8 @@ X1 = rand(n) ln_y = 3 .* X1 .+ rand(n) df = DataFrame(X1 = X1, ln_y = ln_y, id1 = id1, id2 = id2, id3 = id3) @time reg(df, @formula(ln_y ~ X1 + fe(id1)), Vcov.cluster(:id1)) -# 0.869512 seconds (234.23 k allocations: 828.818 MiB, 18.95% compilation time) +# 0.543996 seconds (873 allocations: 815.677 MiB, 34.15% gc time) @time reg(df, @formula(ln_y ~ X1 + fe(id1) + fe(id2)), Vcov.cluster(:id1)) -# 2.192262 seconds (300.08 k allocations: 985.534 MiB, 4.61% gc time, 9.42% compilation time) +# 1.301908 seconds (3.03 k allocations: 968.729 MiB, 25.84% gc time) @time reg(df, @formula(ln_y ~ X1 + fe(id1) + fe(id2) + fe(id3)), Vcov.cluster(:id1)) -# 2.700051 seconds (406.80 k allocations: 1.117 GiB, 3.56% gc time, 10.41% compilation time) +# 1.658832 seconds (4.17 k allocations: 1.095 GiB, 29.78% gc time) diff --git a/benchmark/benchmark.md b/benchmark/benchmark.md index 4188f87c..3217393f 100755 --- a/benchmark/benchmark.md +++ b/benchmark/benchmark.md @@ -3,9 +3,9 @@ Code to reproduce this graph: - Julia + FixedEffectModels.jl v1.9.0 (Julia 1.9) ```julia - using DataFrames, FixedEffectModels + using DataFrames, CategoricalArrays, FixedEffectModels N = 10000000 K = 100 id1 = rand(1:(N/K), N) @@ -13,21 +13,51 @@ Code to reproduce this graph: x1 = randn(N) x2 = randn(N) y= 3 .* x1 .+ 2 .* x2 .+ sin.(id1) .+ cos.(id2).^2 .+ randn(N) - df = DataFrame(id1 = categorical(id1), id2 = categorical(id2), x1 = x1, x2 = x2, w = w, y = y) + df = DataFrame(id1 = categorical(id1), id2 = categorical(id2), x1 = x1, x2 = x2, y = y) @time reg(df, @formula(y ~ x1 + x2)) - #0.601445 seconds (1.05 k allocations: 535.311 MiB, 31.95% gc time) + # 0.338749 seconds (450 allocations: 691.441 MiB, 2.30% gc time) @time reg(df, @formula(y ~ x1 + x2 + fe(id1))) - # 1.624446 seconds (1.21 k allocations: 734.353 MiB, 17.27% gc time) + # 0.463058 seconds (1.00 k allocations: 929.129 MiB, 13.31% gc time) @time reg(df, @formula(y ~ x1 + x2 + fe(id1) + fe(id2))) - # 3.639817 seconds (1.84 k allocations: 999.675 MiB, 11.25% gc time) + # 1.006031 seconds (3.22 k allocations: 1.057 GiB, 1.68% gc time) @time reg(df, @formula(y ~ x1 + x2), Vcov.cluster(:id1)) - # 1.462648 seconds (499.30 k allocations: 690.102 MiB, 15.92% gc time) - @time reg(df, @formula(y ~ x1 + x2, Vcov.cluster(:id1, :id2))) - # 7.187382 seconds (7.02 M allocations: 2.753 GiB, 24.19% gc time) + # 0.380562 seconds (580 allocations: 771.606 MiB, 3.07% gc time) + @time reg(df, @formula(y ~ x1 + x2), Vcov.cluster(:id1, :id2)) + #0.765847 seconds (719 allocations: 1.128 GiB, 2.01% gc time) ```` - R (lfe package) + fixest v0.8.4 (R 
4.2.2) + ```R + library(fixest) + N = 10000000 + K = 100 + df = data.frame( + id1 = as.factor(sample(N/K, N, replace = TRUE)), + id2 = as.factor(sample(K, N, replace = TRUE)), + x1 = runif(N), + x2 = runif(N) + ) + df[, "y"] = 3 * df[, "x1"] + 2 * df[, "x2"] + sin(as.numeric(df[, "id1"])) + cos(as.numeric(df[, "id2"])) + runif(N) + system.time(feols(y ~ x1 + x2, df)) + #> user system elapsed + #> 0.280 0.036 0.317 + system.time(feols(y ~ x1 + x2|id1, df)) + #> user system elapsed + #> 0.616 0.089 0.704 + system.time(feols(y ~ x1 + x2|id1 + id2, df)) + #> user system elapsed + #> 1.181 0.120 1.297 + system.time(feols(y ~ x1 + x2, cluster = "id1", df)) + #> user system elapsed + #> 0.630 0.071 0.700 + system.time(feols(y ~ x1 + x2, cluster = c("id1", "id2"), df)) + #> user system elapsed + #> 1.570 0.197 1.803 + ``` + + + lfe v2.8-8 (R 4.2.2) ```R library(lfe) N = 10000000 @@ -42,22 +72,22 @@ Code to reproduce this graph: system.time(felm(y ~ x1 + x2, df)) #> user system elapsed - #> 1.843 0.476 2.323 + #> 1.137 0.232 1.596 system.time(felm(y ~ x1 + x2|id1, df)) #> user system elapsed - #> 14.831 1.342 15.993 + #> 7.08 0.41 7.46 system.time(felm(y ~ x1 + x2|id1 + id2, df)) #> user system elapsed - #> 10.626 1.358 10.336 + #> 4.832 0.370 4.615 system.time(felm(y ~ x1 + x2|0|0|id1, df)) #> user system elapsed - #> 9.255 0.843 10.110 + #> 3.712 0.287 3.996 system.time(felm(y ~ x1 + x2|0|0|id1 + id2, df)) #> user system elapsed - #> 96.958 1.474 99.113 - ``` + #> 59.119 0.889 59.946 + - Stata (reghdfe version 5.2.9 06aug2018) + reghdfe version 5.6.8 03mar2019 (Stata 16.1) ``` clear all local N = 10000000 @@ -72,13 +102,13 @@ Code to reproduce this graph: set rmsg on reg y x1 x2 - #> r; t=1.20 - areg y x1 x2, a(id1) - #>r; t=15.51 + #> r; t=0.61 + reghdfe y x1 x2, a(id1) + #>r; t=4.64 reghdfe y x1 x2, a(id1 id2) - #> r; t=49.38 + #> r; t==22.99 reg y x1 x2, cl(id1) - #> r; t=11.15 + #> r; t=8.28 ivreg2 y x1 x2, cluster(id1 id2) - #> r; t=118.67 + #> r; t=70.44 ```` diff --git a/benchmark/fixedeffectmodels_benchmark.png b/benchmark/fixedeffectmodels_benchmark.png new file mode 100644 index 00000000..b79ebed1 Binary files /dev/null and b/benchmark/fixedeffectmodels_benchmark.png differ diff --git a/benchmark/result.jl b/benchmark/result.jl index 4d2c004a..07ad8502 100644 --- a/benchmark/result.jl +++ b/benchmark/result.jl @@ -1 +1 @@ -using DataFrames, CSV, Gadfly df = CSV.read("/Users/Matthieu/Dropbox/Github/FixedEffectModels.jl/benchmark/benchmark.csv") df.R = df.R ./ df.Julia df.Stata = df.Stata ./ df.Julia df.Julia = df.Julia ./ df.Julia mdf = melt(df[!, [:Command, :Julia, :R, :Stata]], :Command) mdf = rename(mdf, :variable => :Language) p = plot(mdf, x = "Language", y = "value", color = "Command", Guide.ylabel("Time (Ratio to Julia)"), Guide.xlabel("Model"), Guide.yticks(ticks= [1, 5, 10, 15])) draw(PNG("/Users/Matthieu/Dropbox/Github/FixedEffectModels.jl/benchmark/fixedeffectmodels_benchmark.png", 8inch, 5inch, dpi=300), p) \ No newline at end of file +using DataFrames, CSV, Gadfly df = CSV.read("/Users/matthieugomez/Dropbox/Github/FixedEffectModels.jl/benchmark/benchmark.csv", DataFrame) df."fixest (R)" = df."fixest (R)" ./ df."FixedEffectModels.jl (Julia)" df."lfe (R)" = df."lfe (R)" ./ df."FixedEffectModels.jl (Julia)" df."reghdfe (Stata)" = df."reghdfe (Stata)" ./ df."FixedEffectModels.jl (Julia)" df."FixedEffectModels.jl (Julia)" = df."FixedEffectModels.jl (Julia)" ./ df."FixedEffectModels.jl (Julia)" mdf = stack(df, Not([:Command, :Order])) mdf = rename(mdf, :variable => :Language) p = 
plot(mdf, x = "Command", y = "value", color = "Language", Guide.ylabel("Time (Ratio to Julia)"), Guide.xlabel("Command"), Scale.y_log10) draw(PNG("/Users/matthieugomez/Dropbox/Github/FixedEffectModels.jl/benchmark/fixedeffectmodels_benchmark.png", 8inch, 5inch, dpi=300), p) \ No newline at end of file diff --git a/benchmark/result.png b/benchmark/result.png deleted file mode 100644 index 88290017..00000000 Binary files a/benchmark/result.png and /dev/null differ diff --git a/src/FixedEffectModel.jl b/src/FixedEffectModel.jl index da3fddf5..a65428a4 100644 --- a/src/FixedEffectModel.jl +++ b/src/FixedEffectModel.jl @@ -18,7 +18,7 @@ struct FixedEffectModel <: RegressionModel coefnames::Vector # Name of coefficients - yname::Union{String, Symbol} # Name of dependent variable + responsename::Union{String, Symbol} # Name of dependent variable formula::FormulaTerm # Original formula formula_schema::FormulaTerm # Schema for predict contrasts::Dict @@ -52,7 +52,7 @@ has_fe(m::FixedEffectModel) = has_fe(m.formula) StatsAPI.coef(m::FixedEffectModel) = m.coef StatsAPI.coefnames(m::FixedEffectModel) = m.coefnames -StatsAPI.responsename(m::FixedEffectModel) = m.yname +StatsAPI.responsename(m::FixedEffectModel) = m.responsename StatsAPI.vcov(m::FixedEffectModel) = m.vcov StatsAPI.nobs(m::FixedEffectModel) = m.nobs StatsAPI.dof(m::FixedEffectModel) = m.dof @@ -63,7 +63,7 @@ StatsAPI.islinear(m::FixedEffectModel) = true StatsAPI.deviance(m::FixedEffectModel) = m.tss StatsAPI.rss(m::FixedEffectModel) = m.rss StatsAPI.mss(m::FixedEffectModel) = deviance(m) - rss(m) - +StatsModels.formula(m::FixedEffectModel) = m.formula_schema function StatsAPI.confint(m::FixedEffectModel; level::Real = 0.95) scale = tdistinvcdf(StatsAPI.dof_residual(m), 1 - (1 - level) / 2) @@ -72,17 +72,22 @@ function StatsAPI.confint(m::FixedEffectModel; level::Real = 0.95) end # predict, residuals, modelresponse -function StatsAPI.predict(m::FixedEffectModel, t) - # Require DataFrame input as we are using leftjoin and select from DataFrames here - # Make sure fes are saved - if has_fe(m) - throw("To predict in a fixed effect regression, run `reg` with the option save = true, and then access predicted values using `fe().") - end - ct = StatsModels.columntable(t) - cols, nonmissings = StatsModels.missing_omit(ct, MatrixTerm(m.formula_schema.rhs)) + + +function StatsAPI.predict(m::FixedEffectModel, data) + Tables.istable(data) || + throw(ArgumentError("expected second argument to be a Table, got $(typeof(data))")) + has_fe(m) && + throw("To predict for a model with high-dimensional fixed effects, run `reg` with the option save = true, and then access predicted values using `fe().") + cdata = StatsModels.columntable(data) + cols, nonmissings = StatsModels.missing_omit(cdata, m.formula_schema.rhs) Xnew = modelmatrix(m.formula_schema, cols) - out = Vector{Union{Float64, Missing}}(missing, length(Tables.rows(ct))) - out[nonmissings] = Xnew * m.coef + if all(nonmissings) + out = Xnew * m.coef + else + out = Vector{Union{Float64, Missing}}(missing, length(Tables.rows(cdata))) + out[nonmissings] = Xnew * m.coef + end # Join FE estimates onto data and sum row-wise # This code does not work propertly with missing or with interacted fixed effect, so deleted @@ -96,29 +101,29 @@ function StatsAPI.predict(m::FixedEffectModel, t) return out end -function StatsAPI.residuals(m::FixedEffectModel, t) - if has_fe(m) - throw("To access residuals in a fixed effect regression, run `reg` with the option save = :residuals, and then access residuals with 
`residuals()`") +function StatsAPI.residuals(m::FixedEffectModel, data) + Tables.istable(data) || + throw(ArgumentError("expected second argument to be a Table, got $(typeof(data))")) + has_fe(m) && + throw("To access residuals for a model with high-dimensional fixed effects, run `reg` with the option save = :residuals, and then access residuals with `residuals()`.") + cdata = StatsModels.columntable(data) + cols, nonmissings = StatsModels.missing_omit(cdata, m.formula_schema.rhs) + Xnew = modelmatrix(m.formula_schema, cols) + y = response(m.formula_schema, cdata) + if all(nonmissings) + out = y - Xnew * m.coef else - ct = StatsModels.columntable(t) - cols, nonmissings = StatsModels.missing_omit(ct, MatrixTerm(m.formula_schema.rhs)) - Xnew = modelmatrix(m.formula_schema, cols) - y = response(m.formula_schema, ct) - if all(nonmissings) - out = y - Xnew * m.coef - else - out = Vector{Union{Float64, Missing}}(missing, length(Tables.rows(ct))) - out[nonmissings] = y - Xnew * m.coef - end - return out + out = Vector{Union{Float64, Missing}}(missing, length(Tables.rows(cdata))) + out[nonmissings] = y - Xnew * m.coef end + return out end function StatsAPI.residuals(m::FixedEffectModel) if m.residuals === nothing has_fe(m) && throw("To access residuals in a fixed effect regression, run `reg` with the option save = :residuals, and then access residuals with `residuals()`") - !has_fe(m) && throw("To access residuals, use residuals(x, t) where t is a Table") + !has_fe(m) && throw("To access residuals, use residuals(m, data) where `m` is an estimated FixedEffectModel and `data` is a Table") end m.residuals end @@ -159,7 +164,7 @@ function StatsAPI.coeftable(m::FixedEffectModel; level = 0.95) tt = cc ./ se CoefTable( hcat(cc, se, tt, fdistccdf.(Ref(1), Ref(StatsAPI.dof_residual(m)), abs2.(tt)), conf_int[:, 1:2]), - ["Estimate","Std.Error","t value", "Pr(>|t|)", "Lower 95%", "Upper 95%" ], + ["Estimate","Std. 
Error","t-stat", "Pr(>|t|)", "Lower 95%", "Upper 95%" ], ["$(coefnms[i])" for i = 1:length(cc)], 4) end @@ -170,133 +175,95 @@ end ## ############################################################################## -function title(m::FixedEffectModel) - iv = has_iv(m) - fe = has_fe(m) - if !iv & !fe - return "Linear Model" - elseif iv & !fe - return "IV Model" - elseif !iv & fe - return "Fixed Effect Model" - elseif iv & fe - return "IV Fixed Effect Model" - end -end - -format_scientific(x) = @sprintf("%.3f", x) - function top(m::FixedEffectModel) out = [ "Number of obs" sprint(show, nobs(m), context = :compact => true); "Degrees of freedom" sprint(show, dof(m), context = :compact => true); - "R2" format_scientific(r2(m)); - "R2 Adjusted" format_scientific(adjr2(m)); - "F-Stat" sprint(show, m.F, context = :compact => true); - "p-value" format_scientific(m.p); + "R²" @sprintf("%.3f",r2(m)); + "R² adjusted" @sprintf("%.3f",adjr2(m)); + "F-statistic" sprint(show, m.F, context = :compact => true); + "P-value" @sprintf("%.3f",m.p); ] if has_iv(m) out = vcat(out, - ["F-Stat (First Stage)" sprint(show, m.F_kp, context = :compact => true); - "p-value (First Stage)" format_scientific(m.p_kp); + [ + "F-statistic (first stage)" sprint(show, m.F_kp, context = :compact => true); + "P-value (first stage)" @sprintf("%.3f",m.p_kp); ]) end if has_fe(m) out = vcat(out, - ["R2 within" format_scientific(m.r2_within); - "Iterations" sprint(show, m.iterations, context = :compact => true); + [ + "R² within" @sprintf("%.3f",m.r2_within); + "Iterations" sprint(show, m.iterations, context = :compact => true); ]) end return out end + +import StatsBase: NoQuote, PValue function Base.show(io::IO, m::FixedEffectModel) - ctitle = title(m) - ctop = top(m) - cc = coef(m) - se = stderror(m) - yname = responsename(m) - coefnms = coefnames(m) - conf_int = confint(m) - # put (intercept) last - if !isempty(coefnms) && ((coefnms[1] == Symbol("(Intercept)")) || (coefnms[1] == "(Intercept)")) - newindex = vcat(2:length(cc), 1) - cc = cc[newindex] - se = se[newindex] - conf_int = conf_int[newindex, :] - coefnms = coefnms[newindex] - end - tt = cc ./ se - mat = hcat(cc, se, tt, fdistccdf.(Ref(1), Ref(StatsAPI.dof_residual(m)), abs2.(tt)), conf_int[:, 1:2]) - nr, nc = size(mat) - colnms = ["Estimate","Std.Error","t value", "Pr(>|t|)", "Lower 95%", "Upper 95%"] - rownms = ["$(coefnms[i])" for i = 1:length(cc)] - pvc = 4 - # print + ct = coeftable(m) + #copied from show(iio,cf::Coeftable) + cols = ct.cols; rownms = ct.rownms; colnms = ct.colnms; + nc = length(cols) + nr = length(cols[1]) if length(rownms) == 0 - rownms = AbstractString[lpad("[$i]",floor(Integer, log10(nr))+3) for i in 1:nr] - end - if length(rownms) > 0 - rnwidth = max(4, maximum(length(nm) for nm in rownms) + 2, length(yname) + 2) - else - # if only intercept, rownms is empty collection, so previous would return error - rnwidth = 4 + rownms = [lpad("[$i]",floor(Integer, log10(nr))+3) for i in 1:nr] end - rownms = [rpad(nm,rnwidth-1) * "|" for nm in rownms] - widths = [length(cn)::Int for cn in colnms] - str = [sprint(show, mat[i,j]; context=:compact => true) for i in 1:nr, j in 1:nc] - if pvc != 0 # format the p-values column - for i in 1:nr - str[i, pvc] = format_scientific(mat[i, pvc]) - end + mat = [j == 1 ? NoQuote(rownms[i]) : + j-1 == ct.pvalcol ? NoQuote(sprint(show, PValue(cols[j-1][i]))) : + j-1 in ct.teststatcol ? TestStat(cols[j-1][i]) : + cols[j-1][i] isa AbstractString ? 
NoQuote(cols[j-1][i]) : cols[j-1][i] + for i in 1:nr, j in 1:nc+1] + io = IOContext(io, :compact=>true, :limit=>false) + A = Base.alignment(io, mat, 1:size(mat, 1), 1:size(mat, 2), + typemax(Int), typemax(Int), 3) + nmswidths = pushfirst!(length.(colnms), 0) + A = [nmswidths[i] > sum(A[i]) ? (A[i][1]+nmswidths[i]-sum(A[i]), A[i][2]) : A[i] + for i in 1:length(A)] + totwidth = sum(sum.(A)) + 2 * (length(A) - 1) + + + #intert my stuff which requires totwidth + ctitle = string(typeof(m)) + halfwidth = div(totwidth - length(ctitle), 2) + print(io, " " ^ halfwidth * ctitle * " " ^ halfwidth) + ctop = top(m) + for i in 1:size(ctop, 1) + ctop[i, 1] = ctop[i, 1] * ":" end - for j in 1:nc - for i in 1:nr - lij = length(str[i, j]) - if lij > widths[j] - widths[j] = lij - end + println(io, '\n', repeat('=', totwidth)) + halfwidth = div(totwidth, 2) - 1 + interwidth = 2 + mod(totwidth, 2) + for i in 1:(div(size(ctop, 1) - 1, 2)+1) + print(io, ctop[2*i-1, 1]) + print(io, lpad(ctop[2*i-1, 2], halfwidth - length(ctop[2*i-1, 1]))) + print(io, " " ^interwidth) + if size(ctop, 1) >= 2*i + print(io, ctop[2*i, 1]) + print(io, lpad(ctop[2*i, 2], halfwidth - length(ctop[2*i, 1]))) end + println(io) end - widths .+= 1 - totalwidth = sum(widths) + rnwidth - if length(ctitle) > 0 - halfwidth = div(totalwidth - length(ctitle), 2) - println(io, " " ^ halfwidth * string(ctitle) * " " ^ halfwidth) + + # rest of coeftable code + println(io, repeat('=', totwidth)) + print(io, repeat(' ', sum(A[1]))) + for j in 1:length(colnms) + print(io, " ", lpad(colnms[j], sum(A[j+1]))) end - if length(ctop) > 0 - for i in 1:size(ctop, 1) - ctop[i, 1] = ctop[i, 1] * ":" - end - println(io, "=" ^totalwidth) - halfwidth = div(totalwidth, 2) - 1 - interwidth = 2 + mod(totalwidth, 2) - for i in 1:(div(size(ctop, 1) - 1, 2)+1) - print(io, ctop[2*i-1, 1]) - print(io, lpad(ctop[2*i-1, 2], halfwidth - length(ctop[2*i-1, 1]))) - print(io, " " ^interwidth) - if size(ctop, 1) >= 2*i - print(io, ctop[2*i, 1]) - print(io, lpad(ctop[2*i, 2], halfwidth - length(ctop[2*i, 1]))) - end - println(io) - end + println(io, '\n', repeat('─', totwidth)) + for i in 1:size(mat, 1) + Base.print_matrix_row(io, mat, A, i, 1:size(mat, 2), " ") + i != size(mat, 1) && println(io) end - println(io,"=" ^totalwidth) - println(io, rpad(string(yname), rnwidth-1) * "|" * - join([lpad(string(colnms[i]), widths[i]) for i = 1:nc], "")) - println(io,"-" ^totalwidth) - for i in 1:nr - print(io, rownms[i]) - for j in 1:nc - print(io, lpad(str[i,j],widths[j])) - end - println(io) - end - println(io,"=" ^totalwidth) + println(io, '\n', repeat('=', totwidth)) + nothing end - + ############################################################################## ## diff --git a/src/FixedEffectModels.jl b/src/FixedEffectModels.jl index 8152164e..88bf5f7f 100644 --- a/src/FixedEffectModels.jl +++ b/src/FixedEffectModels.jl @@ -1,16 +1,13 @@ module FixedEffectModels -# slows down tss -#if isdefined(Base, :Experimental) && isdefined(Base.Experimental, Symbol("@optlevel")) -# @eval Base.Experimental.@optlevel 1 -#end using DataFrames using FixedEffects using LinearAlgebra using Printf using Reexport +using SnoopPrecompile using Statistics using StatsAPI using StatsBase @@ -38,14 +35,15 @@ has_iv, has_fe, Vcov -if ccall(:jl_generating_output, Cint, ()) == 1 # if we're precompiling the package - let - df = DataFrame(x1 = [1.0, 2.0, 3.0, 4.0], x2 = [1.0, 2.0, 4.0, 4.0], y = [3.0, 4.0, 4.0, 5.0], id = [1, 1, 2, 2]) - reg(df, @formula(y ~ x1 + x2)) - reg(df, @formula(y ~ x1 + fe(id))) - reg(df, 
@formula(y ~ x1), Vcov.cluster(:id)) - end + +@precompile_all_calls begin + df = DataFrame(x1 = [1.0, 2.0, 3.0, 4.0], x2 = [1.0, 2.0, 4.0, 4.0], y = [3.0, 4.0, 4.0, 5.0], id = [1, 1, 2, 2]) + reg(df, @formula(y ~ x1 + x2)) + reg(df, @formula(y ~ x1 + fe(id))) + reg(df, @formula(y ~ 1), Vcov.cluster(:id)) end + + end diff --git a/src/fit.jl b/src/fit.jl index 4dbd5c34..97bde38e 100644 --- a/src/fit.jl +++ b/src/fit.jl @@ -46,22 +46,22 @@ using RDatasets, FixedEffectModels df = dataset("plm", "Cigar") fit(FixedEffectModel, @formula(Sales ~ NDI + fe(State) + fe(State)&Year), df) """ -function reg(@nospecialize(df), - @nospecialize(formula::FormulaTerm), - @nospecialize(vcov::CovarianceEstimator = Vcov.simple()); - @nospecialize(contrasts::Dict = Dict{Symbol, Any}()), - @nospecialize(weights::Union{Symbol, Nothing} = nothing), - @nospecialize(save::Union{Bool, Symbol} = :none), - @nospecialize(method::Symbol = :cpu), - @nospecialize(nthreads::Integer = method == :cpu ? Threads.nthreads() : 256), - @nospecialize(double_precision::Bool = true), - @nospecialize(tol::Real = 1e-6), - @nospecialize(maxiter::Integer = 10000), - @nospecialize(drop_singletons::Bool = true), - @nospecialize(progress_bar::Bool = true), - @nospecialize(dof_add::Integer = 0), - @nospecialize(subset::Union{Nothing, AbstractVector} = nothing), - @nospecialize(first_stage::Bool = true)) +function reg(df, + formula::FormulaTerm, + vcov::CovarianceEstimator = Vcov.simple(); + contrasts::Dict = Dict{Symbol, Any}(), + weights::Union{Symbol, Nothing} = nothing, + save::Union{Bool, Symbol} = :none, + method::Symbol = :cpu, + nthreads::Integer = method == :cpu ? Threads.nthreads() : 256, + double_precision::Bool = true, + tol::Real = 1e-6, + maxiter::Integer = 10000, + drop_singletons::Bool = true, + progress_bar::Bool = true, + dof_add::Integer = 0, + subset::Union{Nothing, AbstractVector} = nothing, + first_stage::Bool = true) StatsAPI.fit(FixedEffectModel, formula, df, vcov; contrasts = contrasts, weights = weights, save = save, method = method, nthreads = nthreads, double_precision = double_precision, tol = tol, maxiter = maxiter, drop_singletons = drop_singletons, progress_bar = progress_bar, dof_add = dof_add, subset = subset, first_stage = first_stage) end @@ -230,7 +230,7 @@ function StatsAPI.fit(::Type{FixedEffectModel}, all(isfinite, Z) || throw("Some observations for the instrumental variables are infinite") # modify formula to use in predict - formula_schema = FormulaTerm(formula_schema.lhs, (tuple(eachterm(formula_schema.rhs)..., (term for term in eachterm(formula_endo_schema.rhs) if term != ConstantTerm(0))...))) + formula_schema = FormulaTerm(formula_schema.lhs, MatrixTerm(tuple(eachterm(formula_schema.rhs)..., (term for term in eachterm(formula_endo_schema.rhs) if term != ConstantTerm(0))...))) end # compute tss now before potentially demeaning y diff --git a/src/partial_out.jl b/src/partial_out.jl index 4f258345..933be079 100644 --- a/src/partial_out.jl +++ b/src/partial_out.jl @@ -9,7 +9,7 @@ Partial out variables in a Dataframe * `maxiter::Integer`: Maximum number of iterations * `double_precision::Bool`: Should the demeaning operation use Float64 rather than Float32? Default to true. * `tol::Real`: Tolerance -* `align::Bool`: Should the returned DataFrame align with the original DataFrame in case of missing values? Default to true +* `align::Bool`: Should the returned DataFrame align with the original DataFrame in case of missing values? Default to true. 
### Returns * `::DataFrame`: a dataframe with as many columns as there are dependent variables and as many rows as the original dataframe. diff --git a/src/utils/formula.jl b/src/utils/formula.jl index 9d57a02f..f3240734 100644 --- a/src/utils/formula.jl +++ b/src/utils/formula.jl @@ -6,15 +6,7 @@ eachterm(@nospecialize(x::AbstractTerm)) = (x,) eachterm(@nospecialize(x::NTuple{N, AbstractTerm})) where {N} = x -TermOrTerms = Union{AbstractTerm, NTuple{N, AbstractTerm} where N} -hasintercept(@nospecialize(t::TermOrTerms)) = - InterceptTerm{true}() ∈ terms(t) || - ConstantTerm(1) ∈ terms(t) -omitsintercept(@nospecialize(f::FormulaTerm)) = omitsintercept(f.rhs) -omitsintercept(@nospecialize(t::TermOrTerms)) = - InterceptTerm{false}() ∈ terms(t) || - ConstantTerm(0) ∈ terms(t) || - ConstantTerm(-1) ∈ terms(t) + ############################################################################## ## ## Parse IV @@ -48,18 +40,19 @@ struct FixedEffectTerm <: AbstractTerm x::Symbol end StatsModels.termvars(t::FixedEffectTerm) = [t.x] -fe(x::Term) = FixedEffectTerm(Symbol(x)) +fe(x::Term) = fe(Symbol(x)) fe(s::Symbol) = FixedEffectTerm(s) has_fe(::FixedEffectTerm) = true has_fe(::FunctionTerm{typeof(fe)}) = true -has_fe(t::InteractionTerm) = any(has_fe(x) for x in t.terms) +has_fe(@nospecialize(t::InteractionTerm)) = any(has_fe(x) for x in t.terms) has_fe(::AbstractTerm) = false + has_fe(@nospecialize(t::FormulaTerm)) = any(has_fe(x) for x in eachterm(t.rhs)) fesymbol(t::FixedEffectTerm) = t.x -fesymbol(t::FunctionTerm{typeof(fe)}) = Symbol(t.args_parsed[1]) +fesymbol(t::FunctionTerm{typeof(fe)}) = Symbol(t.args[1]) """ parse_fixedeffect(data, formula::FormulaTerm) diff --git a/test/Project.toml b/test/Project.toml index 6168988c..8b7bc29a 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -11,7 +11,7 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [compat] CategoricalArrays = "0.10" -CSV = "0.8" -CUDA = "1, 2, 3" +CSV = "0.8, 0.9, 0.10" +CUDA = "1, 2, 3, 4" DataFrames = "0.21, 0.22, 1" FixedEffects = "2" diff --git a/test/fit.jl b/test/fit.jl index 3a58c90a..fb3885e8 100644 --- a/test/fit.jl +++ b/test/fit.jl @@ -1,690 +1,705 @@ using CUDA, FixedEffectModels, CategoricalArrays, CSV, DataFrames, Test, LinearAlgebra -df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) -df.StateC = categorical(df.State) -df.YearC = categorical(df.Year) -############################################################################## -## -## coefficients -## -############################################################################## - -# simple -m = @formula Sales ~ Price -x = reg(df, m) -@test coef(x) ≈ [139.73446,-0.22974] atol = 1e-4 -m = @formula Sales ~ Price -x = reg(df, m, weights = :Pop) -@test coef(x) ≈ [137.72495428982756,-0.23738] atol = 1e-4 - -df.SalesInt = round.(Int64, df.Sales) -m = @formula SalesInt ~ Price -x = reg(df, m) -@test coef(x) ≈ [139.72674,-0.2296683205] atol = 1e-4 - - -# absorb -m = @formula Sales ~ Price + fe(State) -x = reg(df, m) -@test coef(x) ≈ [-0.20984] atol = 1e-4 -@test x.iterations == 1 - -m = @formula Sales ~ Price + fe(State) + fe(Year) -x = reg(df, m) -@test coef(x) ≈ [-1.08471] atol = 1e-4 -m = @formula Sales ~ Price + fe(State) + fe(State)*Year -x = reg(df, m) -@test coef(x) ≈ [-0.53470, 0.0] atol = 1e-4 -m = @formula Sales ~ Price + fe(State)*Year -x = reg(df, m) -@test coef(x) ≈ [-0.53470, 0.0] atol = 1e-4 - -#@test isempty(coef(reg(df, @formula(Sales ~ 0), @fe(State*Price)))) -df.mState = div.(df.State, 10) -m = @formula Sales 
~ Price + fe(mState)&fe(Year) -x = reg(df, m) -@test coef(x) ≈ [-1.44255] atol = 1e-4 - -m = @formula Sales ~ Price + fe(State)&Year -x = reg(df, m) -@test coef(x) ≈ [13.993028174622104,-0.5804357763515606] atol = 1e-4 -m = @formula Sales ~ Price + Year&fe(State) -x = reg(df, m) -@test coef(x) ≈ [13.993028174622104,-0.5804357763515606] atol = 1e-4 -m = @formula Sales ~ 1 + Year&fe(State) -x = reg(df, m) -@test coef(x) ≈ [174.4084407796102] atol = 1e-4 - -m = @formula Sales ~ Price + fe(State)&Year + fe(Year)&State -x = reg(df, m) -@test coef(x) ≈ [51.2359,- 0.5797] atol = 1e-4 -m = @formula Sales ~ Price + NDI + fe(State)&Year + fe(Year)&State -x = reg(df, m) -@test coef(x) ≈ [-46.4464,-0.2546, -0.005563] atol = 1e-4 -m = @formula Sales ~ 0 + Price + NDI + fe(State)&Year + fe(Year)&State -x = reg(df, m) -@test coef(x) ≈ [-0.21226562244177932,-0.004775616634862829] atol = 1e-4 - - -# recheck these two below -m = @formula Sales ~ Pimin + Price&NDI&fe(State) -x = reg(df, m) -@test coef(x) ≈ [122.98713, 0.30933] atol = 1e-4 -# SSR does not work well here -m = @formula Sales ~ Pimin + (Price&NDI)*fe(State) -x = reg(df, m) -@test coef(x) ≈ [0.421406, 0.0] atol = 1e-4 - -# only one intercept -m = @formula Sales ~ 1 + fe(State) + fe(Year) -x = reg(df, m) - - - - - -# TO DO: REPORT INTERCEPT IN CASE OF FIXED EFFFECTS, LIKE STATA -df.id3 = categorical(mod.(1:size(df, 1), Ref(3))) -df.id4 = categorical(div.(1:size(df, 1), Ref(10))) - -m = @formula Sales ~ id3 -x = reg(df, m) -@test length(coef(x)) == 3 - -m = @formula Sales ~ 0 + id3 -x = reg(df, m) -@test length(coef(x)) == 3 - - -# with fixed effects it's like implicit intercept -m = @formula Sales ~ id3 + fe(id4) -x = reg(df, m) -@test length(coef(x)) == 2 - - -m = @formula Sales ~ Year + fe(State) -x = reg(df, m) - - -m = @formula Sales ~ id3&Price -x = reg(df, m) -@test length(coef(x)) == 4 -m = @formula Sales ~ id3&Price + Price -x = reg(df, m) -@test length(coef(x)) == 4 - - - - - -m = @formula Sales ~ Year + fe(State) -x = reg(df, m) - - - -m = @formula Sales ~ Year&Price + fe(State) -x = reg(df, m) - - - - -# absorb + weights -m = @formula Sales ~ Price + fe(State) -x = reg(df, m, weights = :Pop) -@test coef(x) ≈ [- 0.21741] atol = 1e-4 -m = @formula Sales ~ Price + fe(State) + fe(Year) -x = reg(df, m, weights = :Pop) -@test coef(x) ≈ [- 0.88794] atol = 1e-3 -m = @formula Sales ~ Price + fe(State) + fe(State)&Year -x = reg(df, m, weights = :Pop) -@test coef(x) ≈ [- 0.461085492] atol = 1e-4 - -# iv -m = @formula Sales ~ (Price ~ Pimin) -x = reg(df, m) -@test coef(x) ≈ [138.19479,- 0.20733] atol = 1e-4 -m = @formula Sales ~ NDI + (Price ~ Pimin) -x = reg(df, m) -@test coef(x) ≈ [137.45096,0.00516,- 0.76276] atol = 1e-4 -m = @formula Sales ~ NDI + (Price ~ Pimin + Pop) -x = reg(df, m) -@test coef(x) ≈ [137.57335,0.00534,- 0.78365] atol = 1e-4 -## multiple endogeneous variables -m = @formula Sales ~ (Price + NDI ~ Pimin + Pop) -x = reg(df, m) -@test coef(x) ≈ [139.544, .8001, -.00937] atol = 1e-4 -m = @formula Sales ~ 1 + (Price + NDI ~ Pimin + Pop) -x = reg(df, m) -@test coef(x) ≈ [139.544, .8001, -.00937] atol = 1e-4 -result = [196.576, 0.00490989, -2.94019, -3.00686, -2.94903, -2.80183, -2.74789, -2.66682, -2.63855, -2.52394, -2.34751, -2.19241, -2.18707, -2.09244, -1.9691, -1.80463, -1.81865, -1.70428, -1.72925, -1.68501, -1.66007, -1.56102, -1.43582, -1.36812, -1.33677, -1.30426, -1.28094, -1.25175, -1.21438, -1.16668, -1.13033, -1.03782] - -m = @formula Sales ~ NDI + (Price&YearC ~ Pimin&YearC) -x = reg(df, m) -@test coef(x) ≈ result atol 
= 1e-4 - -# iv + weight -m = @formula Sales ~ (Price ~ Pimin) -x = reg(df, m, weights = :Pop) -@test coef(x) ≈ [137.03637,- 0.22802] atol = 1e-4 - -# iv + weight + absorb -m = @formula Sales ~ (Price ~ Pimin) + fe(State) -x = reg(df, m) -@test coef(x) ≈ [-0.20284] atol = 1e-4 -m = @formula Sales ~ (Price ~ Pimin) + fe(State) -x = reg(df, m, weights = :Pop) -@test coef(x) ≈ [-0.20995] atol = 1e-4 -m = @formula Sales ~ NDI + (Price ~ Pimin) -x = reg(df, m) -@test coef(x) ≈ [137.45096580480387,0.005169677634275297,-0.7627670265757879] atol = 1e-4 -m = @formula Sales ~ NDI + (Price ~ Pimin) + fe(State) -x = reg(df, m) -@test coef(x) ≈ [0.0011021722526916768,-0.3216374943695231] atol = 1e-4 - -# non high dimensional factors -m = @formula Sales ~ Price + YearC -x = reg(df, m) -m = @formula Sales ~ YearC + fe(State) -x = reg(df, m) -m = @formula Sales ~ Price + YearC + fe(State) -x = reg(df, m) -@test coef(x)[1] ≈ -1.08471 atol = 1e-4 -m = @formula Sales ~ Price + YearC + fe(State) -x = reg(df, m, weights = :Pop) -@test coef(x)[1] ≈ -0.88794 atol = 1e-4 -m = @formula Sales ~ NDI + (Price ~ Pimin) + YearC + fe(State) -x = reg(df, m) -@test coef(x)[1] ≈ -0.00525 atol = 1e-4 - ############################################################################## ## -## Programming -## -############################################################################## -reg(df, term(:Sales) ~ term(:NDI) + fe(:State) + fe(:Year), Vcov.cluster(:State)) -@test fe(:State) + fe(:Year) === reduce(+, fe.([:State, :Year])) === fe(term(:State)) + fe(term(:Year)) - - -############################################################################## -## -## Functions -## -############################################################################## - -# function -m = @formula Sales ~ log(Price) -x = reg(df, m) -@test coef(x)[1] ≈184.98520688 atol = 1e-4 - -# function defined in user space -mylog(x) = log(x) -m = @formula Sales ~ mylog(Price) -x = reg(df, m) - -# Function returning Inf -df.Price_zero = copy(df.Price) -df.Price_zero[1] = 0.0 -m = @formula Sales ~ log(Price_zero) -@test_throws "Some observations for the regressor are infinite" reg(df, m) -############################################################################## -## -## collinearity -## add more tests +## coefficients ## ############################################################################## -# ols -df.Price2 = df.Price -m = @formula Sales ~ Price + Price2 -x = reg(df, m) -@test coef(x) ≈ [139.7344639806166,-0.22974688593485126,0.0] atol = 1e-4 - +@testset "coefficients" begin + + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + df.StateC = categorical(df.State) + df.YearC = categorical(df.Year) + + # simple + m = @formula Sales ~ Price + x = reg(df, m) + @test coef(x) ≈ [139.73446,-0.22974] atol = 1e-4 + m = @formula Sales ~ Price + x = reg(df, m, weights = :Pop) + @test coef(x) ≈ [137.72495428982756,-0.23738] atol = 1e-4 -## iv -df.NDI2 = df.NDI -m = @formula Sales ~ NDI2 + NDI + (Price ~ Pimin) -x = reg(df, m) -@test iszero(coef(x)[2]) || iszero(coef(x)[3]) + df.SalesInt = round.(Int64, df.Sales) + m = @formula SalesInt ~ Price + x = reg(df, m) + @test coef(x) ≈ [139.72674,-0.2296683205] atol = 1e-4 -## endogeneous variables collinear with instruments are reclassified -df.zPimin = df.Pimin -m = @formula Sales ~ zPimin + (Price ~ NDI + Pimin) -x = reg(df, m) + # absorb + m = @formula Sales ~ Price + fe(State) + x = reg(df, m) + @test coef(x) ≈ [-0.20984] atol = 1e-4 + @test x.iterations == 1 -m2 = 
@formula Sales ~ (zPimin + Price ~ NDI + Pimin) -NDI = reg(df, m2) -@test coefnames(x) == coefnames(NDI) -@test coef(x) ≈ coef(NDI) -@test vcov(x) ≈ vcov(NDI) + m = @formula Sales ~ Price + fe(State) + fe(Year) + x = reg(df, m) + @test coef(x) ≈ [-1.08471] atol = 1e-4 + m = @formula Sales ~ Price + fe(State) + fe(State)*Year + x = reg(df, m) + @test coef(x) ≈ [-0.53470, 0.0] atol = 1e-4 + m = @formula Sales ~ Price + fe(State)*Year + x = reg(df, m) + @test coef(x) ≈ [-0.53470, 0.0] atol = 1e-4 + #@test isempty(coef(reg(df, @formula(Sales ~ 0), @fe(State*Price)))) + df.mState = div.(df.State, 10) + m = @formula Sales ~ Price + fe(mState)&fe(Year) + x = reg(df, m) + @test coef(x) ≈ [-1.44255] atol = 1e-4 -# catch when IV underidentified -@test_throws "Model not identified. There must be at least as many ivs as endogeneneous variables" reg(df, @formula(Sales ~ Price + (NDI + Pop ~ NDI))) -@test_throws "Model not identified. There must be at least as many ivs as endogeneneous variables" reg(df, @formula(Sales ~ Price + (Pop ~ Price))) + m = @formula Sales ~ Price + fe(State)&Year + x = reg(df, m) + @test coef(x) ≈ [13.993028174622104,-0.5804357763515606] atol = 1e-4 + m = @formula Sales ~ Price + Year&fe(State) + x = reg(df, m) + @test coef(x) ≈ [13.993028174622104,-0.5804357763515606] atol = 1e-4 + m = @formula Sales ~ 1 + Year&fe(State) + x = reg(df, m) + @test coef(x) ≈ [174.4084407796102] atol = 1e-4 + m = @formula Sales ~ Price + fe(State)&Year + fe(Year)&State + x = reg(df, m) + @test coef(x) ≈ [51.2359,- 0.5797] atol = 1e-4 + m = @formula Sales ~ Price + NDI + fe(State)&Year + fe(Year)&State + x = reg(df, m) + @test coef(x) ≈ [-46.4464,-0.2546, -0.005563] atol = 1e-4 + m = @formula Sales ~ 0 + Price + NDI + fe(State)&Year + fe(Year)&State + x = reg(df, m) + @test coef(x) ≈ [-0.21226562244177932,-0.004775616634862829] atol = 1e-4 + # recheck these two below + m = @formula Sales ~ Pimin + Price&NDI&fe(State) + x = reg(df, m) + @test coef(x) ≈ [122.98713, 0.30933] atol = 1e-4 + # SSR does not work well here + m = @formula Sales ~ Pimin + (Price&NDI)*fe(State) + x = reg(df, m) + @test coef(x) ≈ [0.421406, 0.0] atol = 1e-4 + + # only one intercept + m = @formula Sales ~ 1 + fe(State) + fe(Year) + x = reg(df, m) + + + + + + # TO DO: REPORT INTERCEPT IN CASE OF FIXED EFFFECTS, LIKE STATA + df.id3 = categorical(mod.(1:size(df, 1), Ref(3))) + df.id4 = categorical(div.(1:size(df, 1), Ref(10))) + + m = @formula Sales ~ id3 + x = reg(df, m) + @test length(coef(x)) == 3 + + m = @formula Sales ~ 0 + id3 + x = reg(df, m) + @test length(coef(x)) == 3 + + + # with fixed effects it's like implicit intercept + m = @formula Sales ~ id3 + fe(id4) + x = reg(df, m) + @test length(coef(x)) == 2 + + + m = @formula Sales ~ Year + fe(State) + x = reg(df, m) + + + m = @formula Sales ~ id3&Price + x = reg(df, m) + @test length(coef(x)) == 4 + m = @formula Sales ~ id3&Price + Price + x = reg(df, m) + @test length(coef(x)) == 4 + + + + + + m = @formula Sales ~ Year + fe(State) + x = reg(df, m) + + + + m = @formula Sales ~ Year&Price + fe(State) + x = reg(df, m) + + + + + # absorb + weights + m = @formula Sales ~ Price + fe(State) + x = reg(df, m, weights = :Pop) + @test coef(x) ≈ [- 0.21741] atol = 1e-4 + m = @formula Sales ~ Price + fe(State) + fe(Year) + x = reg(df, m, weights = :Pop) + @test coef(x) ≈ [- 0.88794] atol = 1e-3 + m = @formula Sales ~ Price + fe(State) + fe(State)&Year + x = reg(df, m, weights = :Pop) + @test coef(x) ≈ [- 0.461085492] atol = 1e-4 + + # iv + m = @formula Sales ~ (Price ~ Pimin) + x = 
reg(df, m) + @test coef(x) ≈ [138.19479,- 0.20733] atol = 1e-4 + m = @formula Sales ~ NDI + (Price ~ Pimin) + x = reg(df, m) + @test coef(x) ≈ [137.45096,0.00516,- 0.76276] atol = 1e-4 + m = @formula Sales ~ NDI + (Price ~ Pimin + Pop) + x = reg(df, m) + @test coef(x) ≈ [137.57335,0.00534,- 0.78365] atol = 1e-4 + ## multiple endogeneous variables + m = @formula Sales ~ (Price + NDI ~ Pimin + Pop) + x = reg(df, m) + @test coef(x) ≈ [139.544, .8001, -.00937] atol = 1e-4 + m = @formula Sales ~ 1 + (Price + NDI ~ Pimin + Pop) + x = reg(df, m) + @test coef(x) ≈ [139.544, .8001, -.00937] atol = 1e-4 + result = [196.576, 0.00490989, -2.94019, -3.00686, -2.94903, -2.80183, -2.74789, -2.66682, -2.63855, -2.52394, -2.34751, -2.19241, -2.18707, -2.09244, -1.9691, -1.80463, -1.81865, -1.70428, -1.72925, -1.68501, -1.66007, -1.56102, -1.43582, -1.36812, -1.33677, -1.30426, -1.28094, -1.25175, -1.21438, -1.16668, -1.13033, -1.03782] + + m = @formula Sales ~ NDI + (Price&YearC ~ Pimin&YearC) + x = reg(df, m) + @test coef(x) ≈ result atol = 1e-4 + + # iv + weight + m = @formula Sales ~ (Price ~ Pimin) + x = reg(df, m, weights = :Pop) + @test coef(x) ≈ [137.03637,- 0.22802] atol = 1e-4 + # iv + weight + absorb + m = @formula Sales ~ (Price ~ Pimin) + fe(State) + x = reg(df, m) + @test coef(x) ≈ [-0.20284] atol = 1e-4 + m = @formula Sales ~ (Price ~ Pimin) + fe(State) + x = reg(df, m, weights = :Pop) + @test coef(x) ≈ [-0.20995] atol = 1e-4 + m = @formula Sales ~ NDI + (Price ~ Pimin) + x = reg(df, m) + @test coef(x) ≈ [137.45096580480387,0.005169677634275297,-0.7627670265757879] atol = 1e-4 + m = @formula Sales ~ NDI + (Price ~ Pimin) + fe(State) + x = reg(df, m) + @test coef(x) ≈ [0.0011021722526916768,-0.3216374943695231] atol = 1e-4 -# catch when IV underidentified -@test_throws "Model not identified. 
There must be at least as many ivs as endogeneneous variables" reg(df, @formula(Sales ~ Price + (NDI + Pop ~ NDI))) - - -# Make sure all coefficients are estimated -p = [100.0, -40.0, 30.0, 20.0] -df_r = DataFrame(y = p, x = p.^4) -result = reg(df_r, @formula(y ~ x)) -@test sum(abs.(coef(result)) .> 0) == 2 - -############################################################################## -## -## std errors -## -############################################################################## - -# Simple - matches stata -m = @formula Sales ~ Price -x = reg(df, m) -@test stderror(x) ≈ [1.521269, 0.0188963] atol = 1e-6 -# Stata ivreg - ivreg2 discrepancy - matches with df_add=-2 -m = @formula Sales ~ (Price ~ Pimin) -x = reg(df, m) -@test stderror(x) ≈ [1.53661, 0.01915] atol = 1e-4 -# Stata areg -m = @formula Sales ~ Price + fe(State) -x = reg(df, m) -@test stderror(x) ≈ [0.0098003] atol = 1e-7 - -# White -# Stata reg - matches stata -m = @formula Sales ~ Price -x = reg(df, m, Vcov.robust()) -@test stderror(x) ≈ [1.686791, 0.0167042] atol = 1e-6 -# Stata ivreg - ivreg2 discrepancy - matches with df_add=-2 -m = @formula Sales ~ (Price ~ Pimin) -x = reg(df, m, Vcov.robust()) -@test stderror(x) ≈ [1.63305, 0.01674] atol = 1e-4 -# Stata areg - matches areg -m = @formula Sales ~ Price + fe(State) -x = reg(df, m, Vcov.robust()) -@test stderror(x) ≈ [0.0110005] atol = 1e-7 - -# Clustering models -# cluster - matches stata -m = @formula Sales ~ Price -x = reg(df, m, Vcov.cluster(:State)) -@test stderror(x)[2] ≈ 0.0379228 atol = 1e-7 -# cluster with fe - matches areg & reghdfe -m = @formula Sales ~ Price + fe(State) -x = reg(df, m, Vcov.cluster(:Year)) -@test stderror(x) ≈ [0.0220563] atol = 1e-7 -# stata reghxe - matches reghdfe (not areg) -m = @formula Sales ~ Price + fe(State) -x = reg(df, m, Vcov.cluster(:State)) -@test stderror(x) ≈ [0.0357498] atol = 1e-7 -# iv + fe + cluster - matches ivreghdfe -m = @formula Sales ~ NDI + (Price ~Pimin) + fe(State) -x = reg(df, m, Vcov.cluster(:State)) -@test stderror(x) ≈ [0.0019704, 0.1893396] atol = 1e-7 -# iv + fe + cluster + weights - matches ivreghdfe -m = @formula Sales ~ NDI + (Price ~ Pimin) + fe(State) -x = reg(df, m, Vcov.cluster(:State), weights = :Pop) -@test stderror(x) ≈ [0.000759, 0.070836] atol = 1e-6 -# iv + fe + cluster + weights - matches ivreghdfe -m = @formula Sales ~ (Price ~ Pimin) + fe(State) -x = reg(df, m, Vcov.cluster(:State), weights = :Pop) -@test stderror(x) ≈ [0.0337439] atol = 1e-7 -# multiway clustering - matches reghdfe -m = @formula Sales ~ Price -x = reg(df, m, Vcov.cluster(:State, :Year)) -@test stderror(x) ≈ [6.196362, 0.0403469] atol = 1e-6 -# multiway clustering - matches reghdfe -m = @formula Sales ~ Price -x = reg(df, m, Vcov.cluster(:State, :Year)) -@test stderror(x) ≈ [6.196362, 0.0403469] atol = 1e-6 -# fe + multiway clustering - matches reghdfe -m = @formula Sales ~ Price + fe(State) -x = reg(df, m, Vcov.cluster(:State, :Year)) -@test stderror(x) ≈ [0.0405335] atol = 1e-7 -m = @formula Sales ~ Price + fe(State) -x = reg(df, m, Vcov.cluster(:State, :Year)) -@test stderror(x) ≈ [0.0405335] atol = 1e-7 -# fe + clustering on interactions - matches reghdfe -#m = @formula Sales ~ Price + fe(State) vcov = cluster(State&id2) -#x = reg(df, m) -#@test stderror(x) ≈ [0.0110005] atol = 1e-7 -# fe partially nested in interaction clusters - matches reghdfe -#m=@formula Sales ~ Price + fe(State) vcov=cluster(State&id2) -#x = reg(df, m) -#@test stderror(x) ≈ [0.0110005] atol = 1e-7 -# regressor partially nested in interaction 
clusters - matches reghdfe -#m=@formula Sales ~ Price + StateC vcov=cluster(State&id2) -#x = reg(df, m) -#@test stderror(x)[1:2] ≈ [3.032187, 0.0110005] atol=1e-5 - -#check palue printed is correct -#m = @formula Sales ~ Price + fe(State) -#x = reg(df, m, Vcov.cluster(:State)) -#@test coeftable(x).mat[1, 4] ≈ 4.872723900371927e-7 atol = 1e-7 - - -############################################################################## -## -## subset -## -############################################################################## -m = @formula Sales ~ Price + StateC -x0 = reg(df[df.State .<= 30, :], m) + # non high dimensional factors + m = @formula Sales ~ Price + YearC + x = reg(df, m) + m = @formula Sales ~ YearC + fe(State) + x = reg(df, m) + m = @formula Sales ~ Price + YearC + fe(State) + x = reg(df, m) + @test coef(x)[1] ≈ -1.08471 atol = 1e-4 + m = @formula Sales ~ Price + YearC + fe(State) + x = reg(df, m, weights = :Pop) + @test coef(x)[1] ≈ -0.88794 atol = 1e-4 + m = @formula Sales ~ NDI + (Price ~ Pimin) + YearC + fe(State) + x = reg(df, m) + @test coef(x)[1] ≈ -0.00525 atol = 1e-4 + + + ############################################################################## + ## + ## Programming + ## + ############################################################################## + reg(df, term(:Sales) ~ term(:NDI) + fe(:State) + fe(:Year), Vcov.cluster(:State)) + @test fe(:State) + fe(:Year) === reduce(+, fe.([:State, :Year])) === fe(term(:State)) + fe(term(:Year)) + + + ############################################################################## + ## + ## Functions + ## + ############################################################################## + + # function + m = @formula Sales ~ log(Price) + x = reg(df, m) + @test coef(x)[1] ≈184.98520688 atol = 1e-4 + + # function defined in user space + mylog(x) = log(x) + m = @formula Sales ~ mylog(Price) + x = reg(df, m) + + # Function returning Inf + df.Price_zero = copy(df.Price) + df.Price_zero[1] = 0.0 + m = @formula Sales ~ log(Price_zero) + @test_throws "Some observations for the regressor are infinite" reg(df, m) + ############################################################################## + ## + ## collinearity + ## add more tests + ## + ############################################################################## + # ols + df.Price2 = df.Price + m = @formula Sales ~ Price + Price2 + x = reg(df, m) + @test coef(x) ≈ [139.7344639806166,-0.22974688593485126,0.0] atol = 1e-4 + + + ## iv + df.NDI2 = df.NDI + m = @formula Sales ~ NDI2 + NDI + (Price ~ Pimin) + x = reg(df, m) + @test iszero(coef(x)[2]) || iszero(coef(x)[3]) + + + ## endogeneous variables collinear with instruments are reclassified + df.zPimin = df.Pimin + m = @formula Sales ~ zPimin + (Price ~ NDI + Pimin) + x = reg(df, m) + + m2 = @formula Sales ~ (zPimin + Price ~ NDI + Pimin) + NDI = reg(df, m2) + @test coefnames(x) == coefnames(NDI) + @test coef(x) ≈ coef(NDI) + @test vcov(x) ≈ vcov(NDI) + + + # catch when IV underidentified + @test_throws "Model not identified. There must be at least as many ivs as endogeneneous variables" reg(df, @formula(Sales ~ Price + (NDI + Pop ~ NDI))) + @test_throws "Model not identified. There must be at least as many ivs as endogeneneous variables" reg(df, @formula(Sales ~ Price + (Pop ~ Price))) + + + + + + # catch when IV underidentified + @test_throws "Model not identified. 
There must be at least as many ivs as endogeneneous variables" reg(df, @formula(Sales ~ Price + (NDI + Pop ~ NDI))) + + + # Make sure all coefficients are estimated + p = [100.0, -40.0, 30.0, 20.0] + df_r = DataFrame(y = p, x = p.^4) + result = reg(df_r, @formula(y ~ x)) + @test sum(abs.(coef(result)) .> 0) == 2 +end + ############################################################################## + ## + ## std errors + ## + ############################################################################## +@testset "standard errors" begin + + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + df.StateC = categorical(df.State) + df.YearC = categorical(df.Year) + + # Simple - matches stata + m = @formula Sales ~ Price + x = reg(df, m) + @test stderror(x) ≈ [1.521269, 0.0188963] atol = 1e-6 + # Stata ivreg - ivreg2 discrepancy - matches with df_add=-2 + m = @formula Sales ~ (Price ~ Pimin) + x = reg(df, m) + @test stderror(x) ≈ [1.53661, 0.01915] atol = 1e-4 + # Stata areg + m = @formula Sales ~ Price + fe(State) + x = reg(df, m) + @test stderror(x) ≈ [0.0098003] atol = 1e-7 + + # White + # Stata reg - matches stata + m = @formula Sales ~ Price + x = reg(df, m, Vcov.robust()) + @test stderror(x) ≈ [1.686791, 0.0167042] atol = 1e-6 + # Stata ivreg - ivreg2 discrepancy - matches with df_add=-2 + m = @formula Sales ~ (Price ~ Pimin) + x = reg(df, m, Vcov.robust()) + @test stderror(x) ≈ [1.63305, 0.01674] atol = 1e-4 + # Stata areg - matches areg + m = @formula Sales ~ Price + fe(State) + x = reg(df, m, Vcov.robust()) + @test stderror(x) ≈ [0.0110005] atol = 1e-7 + + # Clustering models + # cluster - matches stata + m = @formula Sales ~ Price + x = reg(df, m, Vcov.cluster(:State)) + @test stderror(x)[2] ≈ 0.0379228 atol = 1e-7 + # cluster with fe - matches areg & reghdfe + m = @formula Sales ~ Price + fe(State) + x = reg(df, m, Vcov.cluster(:Year)) + @test stderror(x) ≈ [0.0220563] atol = 1e-7 + # stata reghxe - matches reghdfe (not areg) + m = @formula Sales ~ Price + fe(State) + x = reg(df, m, Vcov.cluster(:State)) + @test stderror(x) ≈ [0.0357498] atol = 1e-7 + # iv + fe + cluster - matches ivreghdfe + m = @formula Sales ~ NDI + (Price ~Pimin) + fe(State) + x = reg(df, m, Vcov.cluster(:State)) + @test stderror(x) ≈ [0.0019704, 0.1893396] atol = 1e-7 + # iv + fe + cluster + weights - matches ivreghdfe + m = @formula Sales ~ NDI + (Price ~ Pimin) + fe(State) + x = reg(df, m, Vcov.cluster(:State), weights = :Pop) + @test stderror(x) ≈ [0.000759, 0.070836] atol = 1e-6 + # iv + fe + cluster + weights - matches ivreghdfe + m = @formula Sales ~ (Price ~ Pimin) + fe(State) + x = reg(df, m, Vcov.cluster(:State), weights = :Pop) + @test stderror(x) ≈ [0.0337439] atol = 1e-7 + # multiway clustering - matches reghdfe + m = @formula Sales ~ Price + x = reg(df, m, Vcov.cluster(:State, :Year)) + @test stderror(x) ≈ [6.196362, 0.0403469] atol = 1e-6 + # multiway clustering - matches reghdfe + m = @formula Sales ~ Price + x = reg(df, m, Vcov.cluster(:State, :Year)) + @test stderror(x) ≈ [6.196362, 0.0403469] atol = 1e-6 + # fe + multiway clustering - matches reghdfe + m = @formula Sales ~ Price + fe(State) + x = reg(df, m, Vcov.cluster(:State, :Year)) + @test stderror(x) ≈ [0.0405335] atol = 1e-7 + m = @formula Sales ~ Price + fe(State) + x = reg(df, m, Vcov.cluster(:State, :Year)) + @test stderror(x) ≈ [0.0405335] atol = 1e-7 + # fe + clustering on interactions - matches reghdfe + #m = @formula Sales ~ Price + fe(State) vcov = cluster(State&id2) + #x = reg(df, m) + 
#@test stderror(x) ≈ [0.0110005] atol = 1e-7 + # fe partially nested in interaction clusters - matches reghdfe + #m=@formula Sales ~ Price + fe(State) vcov=cluster(State&id2) + #x = reg(df, m) + #@test stderror(x) ≈ [0.0110005] atol = 1e-7 + # regressor partially nested in interaction clusters - matches reghdfe + #m=@formula Sales ~ Price + StateC vcov=cluster(State&id2) + #x = reg(df, m) + #@test stderror(x)[1:2] ≈ [3.032187, 0.0110005] atol=1e-5 + + #check palue printed is correct + #m = @formula Sales ~ Price + fe(State) + #x = reg(df, m, Vcov.cluster(:State)) + #@test coeftable(x).mat[1, 4] ≈ 4.872723900371927e-7 atol = 1e-7 +end -m = @formula Sales ~ Price + StateC -Price = reg(df, m, subset = df.State .<= 30) -@test length(Price.esample) == size(df, 1) -@test coef(x0) ≈ coef(Price) atol = 1e-4 -@test vcov(x0) ≈ vcov(Price) atol = 1e-4 + ############################################################################## + ## + ## subset + ## + ############################################################################## +@testset "subset" begin + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + df.StateC = categorical(df.State) + df.YearC = categorical(df.Year) -df.State_missing = ifelse.(df.State .<= 30, df.State, missing) -df.StateC_missing = categorical(df.State_missing) -m = @formula Sales ~ Price + StateC_missing -NDI = reg(df, m) -@test length(NDI.esample) == size(df, 1) -@test coef(x0) ≈ coef(NDI) atol = 1e-4 -@test vcov(x0) ≈ vcov(NDI) atol = 1e-2 + m = @formula Sales ~ Price + StateC + x0 = reg(df[df.State .<= 30, :], m) -# missing weights -df.Price_missing = ifelse.(df.State .<= 30, df.Price, missing) -m = @formula Sales ~ NDI + StateC -x = reg(df, m, weights = :Price_missing) -@test length(x.esample) == size(df, 1) + m = @formula Sales ~ Price + StateC + Price = reg(df, m, subset = df.State .<= 30) + @test length(Price.esample) == size(df, 1) + @test coef(x0) ≈ coef(Price) atol = 1e-4 + @test vcov(x0) ≈ vcov(Price) atol = 1e-4 -# missing interaction -m = @formula Sales ~ NDI + fe(State)&Price_missing -x = reg(df, m) -@test nobs(x) == count(.!ismissing.(df.Price_missing)) + df.State_missing = ifelse.(df.State .<= 30, df.State, missing) + df.StateC_missing = categorical(df.State_missing) + m = @formula Sales ~ Price + StateC_missing + NDI = reg(df, m) + @test length(NDI.esample) == size(df, 1) + @test coef(x0) ≈ coef(NDI) atol = 1e-4 + @test vcov(x0) ≈ vcov(NDI) atol = 1e-2 + # missing weights + df.Price_missing = ifelse.(df.State .<= 30, df.Price, missing) + m = @formula Sales ~ NDI + StateC + x = reg(df, m, weights = :Price_missing) + @test length(x.esample) == size(df, 1) -m = @formula Sales ~ Price + fe(State) -x3 = reg(df, m, subset = df.State .<= 30) -@test length(x3.esample) == size(df, 1) -@test coef(x0)[2] ≈ coef(x3)[1] atol = 1e-4 -m = @formula Sales ~ Price + fe(State_missing) -x4 = reg(df, m) -@test coef(x0)[2] ≈ coef(x4)[1] atol = 1e-4 + # missing interaction + m = @formula Sales ~ NDI + fe(State)&Price_missing + x = reg(df, m) + @test nobs(x) == count(.!ismissing.(df.Price_missing)) + m = @formula Sales ~ Price + fe(State) + x3 = reg(df, m, subset = df.State .<= 30) + @test length(x3.esample) == size(df, 1) + @test coef(x0)[2] ≈ coef(x3)[1] atol = 1e-4 -# categorical variable as fixed effects -m = @formula Sales ~ Price + fe(State) -x5 = reg(df, m, subset = df.State .>= 30) + m = @formula Sales ~ Price + fe(State_missing) + x4 = reg(df, m) + @test coef(x0)[2] ≈ coef(x4)[1] atol = 1e-4 -#Error reported by Erik -m = 
@formula Sales ~ Pimin + CPI -x = reg(df, m, Vcov.cluster(:State), subset = df.State .>= 30) -@test diag(x.vcov) ≈ [130.7464887, 0.0257875, 0.0383939] atol = 1e-4 + # categorical variable as fixed effects + m = @formula Sales ~ Price + fe(State) + x5 = reg(df, m, subset = df.State .>= 30) -############################################################################## -## -## R2 -## -############################################################################## -m = @formula Sales ~ Price -x = reg(df, m) -@test r2(x) ≈ 0.0969 atol = 1e-4 -@test adjr2(x) ≈ 0.09622618 atol = 1e-4 + #Error reported by Erik + m = @formula Sales ~ Pimin + CPI + x = reg(df, m, Vcov.cluster(:State), subset = df.State .>= 30) + @test diag(x.vcov) ≈ [130.7464887, 0.0257875, 0.0383939] atol = 1e-4 +end -############################################################################## -## -## F Stat -## -############################################################################## -m = @formula Sales ~ Price -x = reg(df, m) -@test x.F ≈ 147.82425 atol = 1e-4 -m = @formula Sales ~ Price + fe(State) -x = reg(df, m) -@test x.F ≈ 458.45825 atol = 1e-4 -m = @formula Sales ~ (Price ~ Pimin) -x = reg(df, m) -@test x.F ≈ 117.17329 atol = 1e-4 -m = @formula Sales ~ Price + Pop -x = reg(df, m) -@test x.F ≈ 79.091576 atol = 1e-4 -m = @formula Sales ~ Price -x = reg(df, m, Vcov.cluster(:State)) -@test x.F ≈ 36.70275 atol = 1e-4 -m = @formula Sales ~ (Price ~ Pimin) -x = reg(df, m, Vcov.cluster(:State)) -@test x.F ≈ 39.67227 atol = 1e-4 -# xtivreg2 -m = @formula Sales ~ (Price ~ Pimin) + fe(State) -x = reg(df, m) -@test x.F ≈ 422.46444 atol = 1e-4 - -# p value -m = @formula Pop ~ Pimin -x = reg(df, m) -@test x.p ≈ 0.003998718283554043 atol = 1e-4 -m = @formula Pop ~ Pimin + Price -x = reg(df, m) -@test x.p ≈ 3.369423076033613e-14 atol = 1e-4 - - - - -# Fstat https://github.com/FixedEffects/FixedEffectModels.jl/issues/150 -df_example = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Ftest.csv"))) -x = reg(df_example, @formula(Y ~ fe(id) + (X1 + X2 ~ Z1 + Z2) ), Vcov.robust() ) -@test x.F_kp ≈ 14.5348449357 atol = 1e-4 +@testset "statistics" begin + + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + df.StateC = categorical(df.State) + df.YearC = categorical(df.Year) + ############################################################################## + ## + ## R2 + ## + ############################################################################## + m = @formula Sales ~ Price + x = reg(df, m) + @test r2(x) ≈ 0.0969 atol = 1e-4 + @test adjr2(x) ≈ 0.09622618 atol = 1e-4 + + + ############################################################################## + ## + ## F Stat + ## + ############################################################################## + m = @formula Sales ~ Price + x = reg(df, m) + @test x.F ≈ 147.82425 atol = 1e-4 + m = @formula Sales ~ Price + fe(State) + x = reg(df, m) + @test x.F ≈ 458.45825 atol = 1e-4 + m = @formula Sales ~ (Price ~ Pimin) + x = reg(df, m) + @test x.F ≈ 117.17329 atol = 1e-4 + m = @formula Sales ~ Price + Pop + x = reg(df, m) + @test x.F ≈ 79.091576 atol = 1e-4 + m = @formula Sales ~ Price + x = reg(df, m, Vcov.cluster(:State)) + @test x.F ≈ 36.70275 atol = 1e-4 + m = @formula Sales ~ (Price ~ Pimin) + x = reg(df, m, Vcov.cluster(:State)) + @test x.F ≈ 39.67227 atol = 1e-4 + # xtivreg2 + m = @formula Sales ~ (Price ~ Pimin) + fe(State) + x = reg(df, m) + @test x.F ≈ 422.46444 atol = 1e-4 + + # p value + m = @formula Pop ~ 
Pimin + x = reg(df, m) + @test x.p ≈ 0.003998718283554043 atol = 1e-4 + m = @formula Pop ~ Pimin + Price + x = reg(df, m) + @test x.p ≈ 3.369423076033613e-14 atol = 1e-4 + + + + + # Fstat https://github.com/FixedEffects/FixedEffectModels.jl/issues/150 + df_example = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Ftest.csv"))) + x = reg(df_example, @formula(Y ~ fe(id) + (X1 + X2 ~ Z1 + Z2) ), Vcov.robust() ) + @test x.F_kp ≈ 14.5348449357 atol = 1e-4 + + + ############################################################################## + ## + ## F_kp r_kp statistics for IV. Difference degrees of freedom. + ## + ############################################################################## + m = @formula Sales ~ (Price ~ Pimin) + x = reg(df, m) + @test x.F_kp ≈ 52248.79247 atol = 1e-4 + m = @formula Sales ~ NDI + (Price ~ Pimin) + x = reg(df, m) + @test x.F_kp ≈ 5159.812208193612 atol = 1e-4 + m = @formula Sales ~ (Price ~ Pimin + CPI) + x = reg(df, m) + @test x.F_kp ≈ 27011.44080 atol = 1e-4 + m = @formula Sales ~ (Price ~ Pimin) + fe(State) + x = reg(df, m) + @test x.F_kp ≈ 100927.75710 atol = 1e-4 + + # exactly same with ivreg2 + m = @formula Sales ~ (Price ~ Pimin) + x = reg(df, m, Vcov.robust()) + @test x.F_kp ≈ 23160.06350 atol = 1e-4 + m = @formula Sales ~ (Price ~ Pimin) + fe(State) + x = reg(df, m, Vcov.robust()) + @test x.F_kp ≈ 37662.82808 atol = 1e-4 + m = @formula Sales ~ CPI + (Price ~ Pimin) + x = reg(df, m, Vcov.robust()) + @test x.F_kp ≈ 2093.46609 atol = 1e-4 + m = @formula Sales ~ (Price ~ Pimin + CPI) + x = reg(df, m, Vcov.robust()) + @test x.F_kp ≈ 16418.21196 atol = 1e-4 + + + + # like in ivreg2 but += 5 difference. combination iv difference, degrees of freedom difference? + # iv + cluster - F_kp varies from ivreg2 about 7e-4 (SEs off by more than 2 df) + m = @formula Sales ~ (Price ~ Pimin) + x = reg(df, m, Vcov.cluster(:State)) + @test x.F_kp ≈ 7249.88606 atol = 1e-4 + # iv + cluster - F_kp varies from ivreg2 about 5e-6 (SEs off by more than 2 df) + m = @formula Sales ~ CPI + (Price ~ Pimin) + x = reg(df, m, Vcov.cluster(:State)) + @test x.F_kp ≈ 538.40393 atol = 1e-4 + # Modified test values below after multiway clustering update + # iv + 2way clustering - F_kp matches ivreg2 (SEs match with df_add=-2) + m = @formula Sales ~ CPI + (Price ~ Pimin) + x = reg(df, m, Vcov.cluster(:State, :Year)) + @test x.F_kp ≈ 421.9651 atol = 1e-4 + # multivariate iv + clustering - F_kp varies from ivreg2 about 3 (SEs off by more than 2 df) + m = @formula Sales ~ (Price ~ Pimin + CPI) + x = reg(df, m, Vcov.cluster(:State)) + @test x.F_kp ≈ 4080.66081 atol = 1e-4 + # multivariate iv + multiway clustering - F_kp varies from ivreg2 about 2 (SEs off by more than 2 df) + m = @formula Sales ~ (Price ~ Pimin + CPI) + x = reg(df, m, Vcov.cluster(:State, :Year)) + @test x.F_kp ≈ 2873.1405 atol = 1e-4 +end -############################################################################## -## -## F_kp r_kp statistics for IV. Difference degrees of freedom. 
-## -############################################################################## -m = @formula Sales ~ (Price ~ Pimin) -x = reg(df, m) -@test x.F_kp ≈ 52248.79247 atol = 1e-4 -m = @formula Sales ~ NDI + (Price ~ Pimin) -x = reg(df, m) -@test x.F_kp ≈ 5159.812208193612 atol = 1e-4 -m = @formula Sales ~ (Price ~ Pimin + CPI) -x = reg(df, m) -@test x.F_kp ≈ 27011.44080 atol = 1e-4 -m = @formula Sales ~ (Price ~ Pimin) + fe(State) -x = reg(df, m) -@test x.F_kp ≈ 100927.75710 atol = 1e-4 - -# exactly same with ivreg2 -m = @formula Sales ~ (Price ~ Pimin) -x = reg(df, m, Vcov.robust()) -@test x.F_kp ≈ 23160.06350 atol = 1e-4 -m = @formula Sales ~ (Price ~ Pimin) + fe(State) -x = reg(df, m, Vcov.robust()) -@test x.F_kp ≈ 37662.82808 atol = 1e-4 -m = @formula Sales ~ CPI + (Price ~ Pimin) -x = reg(df, m, Vcov.robust()) -@test x.F_kp ≈ 2093.46609 atol = 1e-4 -m = @formula Sales ~ (Price ~ Pimin + CPI) -x = reg(df, m, Vcov.robust()) -@test x.F_kp ≈ 16418.21196 atol = 1e-4 - - - -# like in ivreg2 but += 5 difference. combination iv difference, degrees of freedom difference? -# iv + cluster - F_kp varies from ivreg2 about 7e-4 (SEs off by more than 2 df) -m = @formula Sales ~ (Price ~ Pimin) -x = reg(df, m, Vcov.cluster(:State)) -@test x.F_kp ≈ 7249.88606 atol = 1e-4 -# iv + cluster - F_kp varies from ivreg2 about 5e-6 (SEs off by more than 2 df) -m = @formula Sales ~ CPI + (Price ~ Pimin) -x = reg(df, m, Vcov.cluster(:State)) -@test x.F_kp ≈ 538.40393 atol = 1e-4 -# Modified test values below after multiway clustering update -# iv + 2way clustering - F_kp matches ivreg2 (SEs match with df_add=-2) -m = @formula Sales ~ CPI + (Price ~ Pimin) -x = reg(df, m, Vcov.cluster(:State, :Year)) -@test x.F_kp ≈ 421.9651 atol = 1e-4 -# multivariate iv + clustering - F_kp varies from ivreg2 about 3 (SEs off by more than 2 df) -m = @formula Sales ~ (Price ~ Pimin + CPI) -x = reg(df, m, Vcov.cluster(:State)) -@test x.F_kp ≈ 4080.66081 atol = 1e-4 -# multivariate iv + multiway clustering - F_kp varies from ivreg2 about 2 (SEs off by more than 2 df) -m = @formula Sales ~ (Price ~ Pimin + CPI) -x = reg(df, m, Vcov.cluster(:State, :Year)) -@test x.F_kp ≈ 2873.1405 atol = 1e-4 +@testset "singletons" begin + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + df.StateC = categorical(df.State) + df.YearC = categorical(df.Year) -############################################################################## -## -## Test singleton -## -## -############################################################################## -df.n = max.(1:size(df, 1), 60) -df.pn = categorical(df.n) -m = @formula Sales ~ Price + fe(pn) -x = reg(df, m, Vcov.cluster(:State)) -@test x.nobs == 60 + df.n = max.(1:size(df, 1), 60) + df.pn = categorical(df.n) + m = @formula Sales ~ Price + fe(pn) + x = reg(df, m, Vcov.cluster(:State)) + @test x.nobs == 60 -m = @formula Sales ~ Price + fe(pn) -x = reg(df, m, Vcov.cluster(:State), drop_singletons = false) -@test x.nobs == 1380 -############################################################################## -## -## Test unbalanced panel -## -## corresponds to abdata in Stata, for instance reghxe wage emp [w=indoutpt], a(id year) -## -############################################################################## -df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/EmplUK.csv"))) - -m = @formula Wage ~ Emp + fe(Firm) -x = reg(df, m) -@test coef(x) ≈ [- 0.11981270017206136] atol = 1e-4 -m = @formula Wage ~ Emp + fe(Firm)&Year -x = reg(df, 
m) -@test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] atol = 1e-4 -m = @formula Wage ~ Emp + Year&fe(Firm) -x = reg(df, m) -@test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] atol = 1e-4 -m = @formula Wage ~ 1 + Year&fe(Firm) -x = reg(df, m) -@test coef(x) ≈ [- 356.40430526316396] atol = 1e-4 -m = @formula Wage ~ Emp + fe(Firm) -x = reg(df, m, weights = :Output) -@test coef(x) ≈ [- 0.11514363590574725] atol = 1e-4 - -# absorb + weights -m = @formula Wage ~ Emp + fe(Firm) + fe(Year) -x = reg(df, m) -@test coef(x) ≈ [- 0.04683333721137311] atol = 1e-4 -m = @formula Wage ~ Emp + fe(Firm) + fe(Year) -x = reg(df, m, weights = :Output) -@test coef(x) ≈ [- 0.043475472188120416] atol = 1e-3 - -## the last two ones test an ill conditioned model matrix -# SSR does not work well here -m = @formula Wage ~ Emp + fe(Firm) + fe(Firm)&Year -x = reg(df, m) -@test coef(x) ≈ [- 0.122354] atol = 1e-4 -@test x.iterations <= 30 - -# SSR does not work well here -m = @formula Wage ~ Emp + fe(Firm) + fe(Firm)&Year -x = reg(df, m, weights = :Output) -@test coef(x) ≈ [- 0.11752306001586807] atol = 1e-4 -@test x.iterations <= 50 - -methods_vec = [:cpu] -if FixedEffectModels.FixedEffects.has_CUDA() - push!(methods_vec, :gpu) + m = @formula Sales ~ Price + fe(pn) + x = reg(df, m, Vcov.cluster(:State), drop_singletons = false) + @test x.nobs == 1380 end -for method in methods_vec - # same thing with float32 precision - local m = @formula Wage ~ Emp + fe(Firm) - local x = reg(df, m, method = method, double_precision = false) - @test coef(x) ≈ [- 0.11981270017206136] rtol = 1e-4 - local m = @formula Wage ~ Emp + fe(Firm)&Year - local x = reg(df, m, method = method, double_precision = false) - @test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] rtol = 1e-4 - local m = @formula Wage ~ Emp + Year&fe(Firm) - local x = reg(df, m, method = method, double_precision = false) - @test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] rtol = 1e-4 - local m = @formula Wage ~ 1 + Year&fe(Firm) - local x = reg(df, m, method = method, double_precision = false) - @test coef(x) ≈ [- 356.40430526316396] rtol = 1e-4 - local m = @formula Wage ~ Emp + fe(Firm) - local x = reg(df, m, weights = :Output, method = method, double_precision = false) - @test coef(x) ≈ [- 0.11514363590574725] rtol = 1e-4 - local m = @formula Wage ~ Emp + fe(Firm) + fe(Year) - local x = reg(df, m, method = method, double_precision = false) - @test coef(x) ≈ [- 0.04683333721137311] rtol = 1e-4 - local m = @formula Wage ~ Emp + fe(Firm) + fe(Year) - local x = reg(df, m, weights = :Output, method = method, double_precision = false) - @test coef(x) ≈ [- 0.043475472188120416] atol = 1e-3 -end - -# add tests with missing fixed effects -df.Firm_missing = ifelse.(df.Firm .<= 30, missing, df.Firm) -## test with missing fixed effects -m = @formula Wage ~ Emp + fe(Firm_missing) -x = reg(df, m) -@test coef(x) ≈ [-.1093657] atol = 1e-4 -@test stderror(x) ≈ [.032949 ] atol = 1e-4 -@test r2(x) ≈ 0.8703 atol = 1e-2 -@test adjr2(x) ≈ 0.8502 atol = 1e-2 -@test x.nobs == 821 - -## test with missing interaction -df.Year2 = df.Year .>= 1980 -m = @formula Wage ~ Emp + fe(Firm_missing) & fe(Year2) -x = reg(df, m) -@test coef(x) ≈ [-0.100863] atol = 1e-4 -@test stderror(x) ≈ [0.04149] atol = 1e-4 -@test x.nobs == 821 - - -############################################################################## -## -## Missing dependent / independent variables -## -############################################################################## +@testset "unbalanced panel" begin 
+ + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/EmplUK.csv"))) + + m = @formula Wage ~ Emp + fe(Firm) + x = reg(df, m) + @test coef(x) ≈ [- 0.11981270017206136] atol = 1e-4 + m = @formula Wage ~ Emp + fe(Firm)&Year + x = reg(df, m) + @test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] atol = 1e-4 + m = @formula Wage ~ Emp + Year&fe(Firm) + x = reg(df, m) + @test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] atol = 1e-4 + m = @formula Wage ~ 1 + Year&fe(Firm) + x = reg(df, m) + @test coef(x) ≈ [- 356.40430526316396] atol = 1e-4 + m = @formula Wage ~ Emp + fe(Firm) + x = reg(df, m, weights = :Output) + @test coef(x) ≈ [- 0.11514363590574725] atol = 1e-4 + + # absorb + weights + m = @formula Wage ~ Emp + fe(Firm) + fe(Year) + x = reg(df, m) + @test coef(x) ≈ [- 0.04683333721137311] atol = 1e-4 + m = @formula Wage ~ Emp + fe(Firm) + fe(Year) + x = reg(df, m, weights = :Output) + @test coef(x) ≈ [- 0.043475472188120416] atol = 1e-3 + ## the last two ones test an ill conditioned model matrix + # SSR does not work well here + m = @formula Wage ~ Emp + fe(Firm) + fe(Firm)&Year + x = reg(df, m) + @test coef(x) ≈ [- 0.122354] atol = 1e-4 + @test x.iterations <= 30 + + # SSR does not work well here + m = @formula Wage ~ Emp + fe(Firm) + fe(Firm)&Year + x = reg(df, m, weights = :Output) + @test coef(x) ≈ [- 0.11752306001586807] atol = 1e-4 + @test x.iterations <= 50 + + methods_vec = [:cpu] + if FixedEffectModels.FixedEffects.has_CUDA() + push!(methods_vec, :gpu) + end + for method in methods_vec + # same thing with float32 precision + local m = @formula Wage ~ Emp + fe(Firm) + local x = reg(df, m, method = method, double_precision = false) + @test coef(x) ≈ [- 0.11981270017206136] rtol = 1e-4 + local m = @formula Wage ~ Emp + fe(Firm)&Year + local x = reg(df, m, method = method, double_precision = false) + @test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] rtol = 1e-4 + local m = @formula Wage ~ Emp + Year&fe(Firm) + local x = reg(df, m, method = method, double_precision = false) + @test coef(x) ≈ [-315.0000747500431,- 0.07633636891202833] rtol = 1e-4 + local m = @formula Wage ~ 1 + Year&fe(Firm) + local x = reg(df, m, method = method, double_precision = false) + @test coef(x) ≈ [- 356.40430526316396] rtol = 1e-4 + local m = @formula Wage ~ Emp + fe(Firm) + local x = reg(df, m, weights = :Output, method = method, double_precision = false) + @test coef(x) ≈ [- 0.11514363590574725] rtol = 1e-4 + local m = @formula Wage ~ Emp + fe(Firm) + fe(Year) + local x = reg(df, m, method = method, double_precision = false) + @test coef(x) ≈ [- 0.04683333721137311] rtol = 1e-4 + local m = @formula Wage ~ Emp + fe(Firm) + fe(Year) + local x = reg(df, m, weights = :Output, method = method, double_precision = false) + @test coef(x) ≈ [- 0.043475472188120416] atol = 1e-3 + end + + + # add tests with missing fixed effects + df.Firm_missing = ifelse.(df.Firm .<= 30, missing, df.Firm) + + + ## test with missing fixed effects + m = @formula Wage ~ Emp + fe(Firm_missing) + x = reg(df, m) + @test coef(x) ≈ [-.1093657] atol = 1e-4 + @test stderror(x) ≈ [.032949 ] atol = 1e-4 + @test r2(x) ≈ 0.8703 atol = 1e-2 + @test adjr2(x) ≈ 0.8502 atol = 1e-2 + @test x.nobs == 821 + + ## test with missing interaction + df.Year2 = df.Year .>= 1980 + m = @formula Wage ~ Emp + fe(Firm_missing) & fe(Year2) + x = reg(df, m) + @test coef(x) ≈ [-0.100863] atol = 1e-4 + @test stderror(x) ≈ [0.04149] atol = 1e-4 + @test x.nobs == 821 +end -# Missing -df1 = DataFrame(a=[1.0, 2.0, 3.0, 4.0], 
b=[5.0, 7.0, 11.0, 13.0]) -df2 = DataFrame(a=[1.0, missing, 3.0, 4.0], b=[5.0, 7.0, 11.0, 13.0]) -x = reg(df1, @formula(a ~ b)) -@test coef(x) ≈ [ -0.6500000000000004, 0.35000000000000003] atol = 1e-4 -x = reg(df1, @formula(b ~ a)) -@test coef(x) ≈ [2.0, 2.8] atol = 1e-4 -x = reg(df2,@formula(a ~ b)) -@test coef(x) ≈ [ -0.8653846153846163, 0.3653846153846155] atol = 1e-4 -x = reg(df2,@formula(b ~ a)) -@test coef(x) ≈ [ 2.4285714285714253, 2.7142857142857157] atol = 1e-4 - -# Works with Integers -df1 = DataFrame(a=[1, 2, 3, 4], b=[5, 7, 11, 13], c = categorical([1, 1, 2, 2])) -x = reg(df1, @formula(a ~ b)) -@test coef(x) ≈ [-0.65, 0.35] atol = 1e-4 -x = reg(df1, @formula(a ~ b + fe(c))) -@test coef(x) ≈ [0.5] atol = 1e-4 +@testset "missings" begin + + # Missing + df1 = DataFrame(a=[1.0, 2.0, 3.0, 4.0], b=[5.0, 7.0, 11.0, 13.0]) + df2 = DataFrame(a=[1.0, missing, 3.0, 4.0], b=[5.0, 7.0, 11.0, 13.0]) + x = reg(df1, @formula(a ~ b)) + @test coef(x) ≈ [ -0.6500000000000004, 0.35000000000000003] atol = 1e-4 + x = reg(df1, @formula(b ~ a)) + @test coef(x) ≈ [2.0, 2.8] atol = 1e-4 + x = reg(df2,@formula(a ~ b)) + @test coef(x) ≈ [ -0.8653846153846163, 0.3653846153846155] atol = 1e-4 + x = reg(df2,@formula(b ~ a)) + @test coef(x) ≈ [ 2.4285714285714253, 2.7142857142857157] atol = 1e-4 + + # Works with Integers + df1 = DataFrame(a=[1, 2, 3, 4], b=[5, 7, 11, 13], c = categorical([1, 1, 2, 2])) + x = reg(df1, @formula(a ~ b)) + @test coef(x) ≈ [-0.65, 0.35] atol = 1e-4 + x = reg(df1, @formula(a ~ b + fe(c))) + @test coef(x) ≈ [0.5] atol = 1e-4 +end diff --git a/test/formula.jl b/test/formula.jl index fc91412f..b75b50fe 100644 --- a/test/formula.jl +++ b/test/formula.jl @@ -4,61 +4,61 @@ using FixedEffectModels: parse_fixedeffect, _parse_fixedeffect, _multiply using FixedEffects import Base: == -==(x::FixedEffect{R,I}, y::FixedEffect{R,I}) where {R,I} = +function ==(x::FixedEffect, y::FixedEffect) x.refs == y.refs && x.interaction == y.interaction && x.n == y.n +end + +csvfile = CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv")) +df = DataFrame(csvfile) -@testset "parse_fixedeffect" begin - csvfile = CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv")) - df = DataFrame(csvfile) - # Any table type supporting the Tables.jl interface should work - for data in [df, csvfile] - @test _parse_fixedeffect(data, term(:Price)) === nothing - @test _parse_fixedeffect(data, ConstantTerm(1)) === nothing - @test _parse_fixedeffect(data, fe(:State)) == (FixedEffect(data.State), :fe_State, [:State]) - - @test _parse_fixedeffect(data, fe(:State)&term(:Year)) == - (FixedEffect(data.State, interaction=_multiply(data, [:Year])), Symbol("fe_State&Year"), [:State]) - @test _parse_fixedeffect(data, fe(:State)&fe(:Year)) == - (FixedEffect(data.State, data.Year), Symbol("fe_State&fe_Year"), [:State, :Year]) +# Any table type supporting the Tables.jl interface should work +for data in [df, csvfile] + @test _parse_fixedeffect(data, term(:Price)) === nothing + @test _parse_fixedeffect(data, ConstantTerm(1)) === nothing + @test _parse_fixedeffect(data, fe(:State)) == (FixedEffect(data.State), :fe_State, [:State]) + + @test _parse_fixedeffect(data, fe(:State)&term(:Year)) == + (FixedEffect(data.State, interaction=_multiply(data, [:Year])), Symbol("fe_State&Year"), [:State]) + @test _parse_fixedeffect(data, fe(:State)&fe(:Year)) == + (FixedEffect(data.State, data.Year), Symbol("fe_State&fe_Year"), [:State, :Year]) - @test parse_fixedeffect(data, ()) == (FixedEffect[], Symbol[], 
Symbol[], ()) - - f = @formula(y ~ 1 + Price) - ts1 = f.rhs - ts2 = term(1) + term(:Price) - @test parse_fixedeffect(data, f) == (FixedEffect[], Symbol[], Symbol[], f) - @test parse_fixedeffect(data, ts1) == (FixedEffect[], Symbol[], Symbol[], ts1) - @test parse_fixedeffect(data, ts2) == parse_fixedeffect(data, ts1) + @test parse_fixedeffect(data, ()) == (FixedEffect[], Symbol[], Symbol[], ()) + + f = @formula(y ~ 1 + Price) + ts1 = f.rhs + ts2 = term(1) + term(:Price) + @test parse_fixedeffect(data, f) == (FixedEffect[], Symbol[], Symbol[], f) + @test parse_fixedeffect(data, ts1) == (FixedEffect[], Symbol[], Symbol[], ts1) + @test parse_fixedeffect(data, ts2) == parse_fixedeffect(data, ts1) - fparsed = term(:y) ~ InterceptTerm{false}() + term(:Price) - tsparsed = (InterceptTerm{false}(), term(:Price)) + fparsed = term(:y) ~ InterceptTerm{false}() + term(:Price) + tsparsed = (InterceptTerm{false}(), term(:Price)) - f = @formula(y ~ 1 + Price + fe(State)) - ts1 = f.rhs - ts2 = term(1) + term(:Price) + fe(:State) - @test parse_fixedeffect(data, f) == ([FixedEffect(data.State)], [:fe_State], [:State], fparsed) - @test parse_fixedeffect(data, ts1) == ([FixedEffect(data.State)], [:fe_State], [:State], tsparsed) - @test parse_fixedeffect(data, ts2) == parse_fixedeffect(data, ts1) + f = @formula(y ~ 1 + Price + fe(State)) + ts1 = f.rhs + ts2 = term(1) + term(:Price) + fe(:State) + @test parse_fixedeffect(data, f) == ([FixedEffect(data.State)], [:fe_State], [:State], fparsed) + @test parse_fixedeffect(data, ts1) == ([FixedEffect(data.State)], [:fe_State], [:State], tsparsed) + @test parse_fixedeffect(data, ts2) == parse_fixedeffect(data, ts1) - f = @formula(y ~ Price + fe(State) + fe(Year)) - ts1 = f.rhs - ts2 = term(:Price) + fe(:State) + fe(:Year) - @test parse_fixedeffect(data, f) == ([FixedEffect(data.State), FixedEffect(data.Year)], [:fe_State, :fe_Year], [:State, :Year], fparsed) - @test parse_fixedeffect(data, ts1) == ([FixedEffect(data.State), FixedEffect(data.Year)], [:fe_State, :fe_Year], [:State, :Year], tsparsed) - @test parse_fixedeffect(data, ts2) == parse_fixedeffect(data, ts1) + f = @formula(y ~ Price + fe(State) + fe(Year)) + ts1 = f.rhs + ts2 = term(:Price) + fe(:State) + fe(:Year) + @test parse_fixedeffect(data, f) == ([FixedEffect(data.State), FixedEffect(data.Year)], [:fe_State, :fe_Year], [:State, :Year], fparsed) + @test parse_fixedeffect(data, ts1) == ([FixedEffect(data.State), FixedEffect(data.Year)], [:fe_State, :fe_Year], [:State, :Year], tsparsed) + @test parse_fixedeffect(data, ts2) == parse_fixedeffect(data, ts1) - f = @formula(y ~ Price + fe(State)&Year) - ts1 = f.rhs - ts2 = term(:Price) + fe(:State)&term(:Year) - @test parse_fixedeffect(data, f) == ([FixedEffect(data.State, interaction=_multiply(data, [:Year]))], [Symbol("fe_State&Year")], [:State], term(:y) ~ (term(:Price),)) - @test parse_fixedeffect(data, ts1) == ([FixedEffect(data.State, interaction=_multiply(data, [:Year]))], [Symbol("fe_State&Year")], [:State], (term(:Price),)) - @test parse_fixedeffect(data, ts2) == parse_fixedeffect(data, ts1) + f = @formula(y ~ Price + fe(State)&Year) + ts1 = f.rhs + ts2 = term(:Price) + fe(:State)&term(:Year) + @test parse_fixedeffect(data, f) == ([FixedEffect(data.State, interaction=_multiply(data, [:Year]))], [Symbol("fe_State&Year")], [:State], term(:y) ~ (term(:Price),)) + @test parse_fixedeffect(data, ts1) == ([FixedEffect(data.State, interaction=_multiply(data, [:Year]))], [Symbol("fe_State&Year")], [:State], (term(:Price),)) + @test parse_fixedeffect(data, ts2) == 
parse_fixedeffect(data, ts1) - f = @formula(y ~ Price + fe(State)*fe(Year)) - ts1 = f.rhs - ts2 = term(:Price) + fe(:State) + fe(:Year) + fe(:State)&fe(:Year) - @test parse_fixedeffect(data, f) == ([FixedEffect(data.State), FixedEffect(data.Year), FixedEffect(data.State, data.Year)], [:fe_State, :fe_Year, Symbol("fe_State&fe_Year")], [:State, :Year], fparsed) - @test parse_fixedeffect(data, ts1) == ([FixedEffect(data.State), FixedEffect(data.Year), FixedEffect(data.State, data.Year)], [:fe_State, :fe_Year, Symbol("fe_State&fe_Year")], [:State, :Year], tsparsed) - @test parse_fixedeffect(data, ts2) == parse_fixedeffect(data, ts1) - end + f = @formula(y ~ Price + fe(State)*fe(Year)) + ts1 = f.rhs + ts2 = term(:Price) + fe(:State) + fe(:Year) + fe(:State)&fe(:Year) + @test parse_fixedeffect(data, f) == ([FixedEffect(data.State), FixedEffect(data.Year), FixedEffect(data.State, data.Year)], [:fe_State, :fe_Year, Symbol("fe_State&fe_Year")], [:State, :Year], fparsed) + @test parse_fixedeffect(data, ts1) == ([FixedEffect(data.State), FixedEffect(data.Year), FixedEffect(data.State, data.Year)], [:fe_State, :fe_Year, Symbol("fe_State&fe_Year")], [:State, :Year], tsparsed) + @test parse_fixedeffect(data, ts2) == parse_fixedeffect(data, ts1) end diff --git a/test/partial_out.jl b/test/partial_out.jl index ddc01514..d8da23f7 100644 --- a/test/partial_out.jl +++ b/test/partial_out.jl @@ -1,24 +1,26 @@ using FixedEffectModels, DataFrames, Statistics, CSV, Test -df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) +@testset "partial out" begin + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) -@test Matrix(partial_out(df, @formula(Sales + Price ~ NDI))[1])[1:5, :] ≈ [ -37.2108 9.72654; -35.5599 9.87628; -32.309 8.82602; -34.2826 9.64653; -35.0526 8.84143] atol = 1e-3 -@test Matrix(partial_out(df, @formula(Sales + Price ~ NDI + fe(State)))[1])[1:5, :] ≈ [ -21.3715 -0.642783; -19.6571 -0.540335; -16.3427 -1.63789; -18.2631 -0.856975; -18.9784 -1.70283] atol = 1e-3 -@test Matrix(partial_out(df, @formula(Sales + Price ~ 1 + fe(State)))[1])[1:5, :] ≈ [-13.5767 -40.5467; -12.0767 -39.3467; -8.97667 -39.3467; -11.0767 -37.6467; -11.9767 -37.5467] atol = 1e-3 -@test Matrix(partial_out(df, @formula(Sales + Price ~ 1))[1])[1:5, :] ≈ [ -30.0509 -40.0999; -28.5509 -38.8999; -25.4509 -38.8999; -27.5509 -37.1999; -28.4509 -37.0999] atol = 1e-3 -@test mean(Matrix(partial_out(df, @formula(Sales + Price ~ NDI), add_mean = true)[1]), dims = 1[1]) ≈ [123.951 68.6999] atol = 1e-3 -@test mean(Matrix(partial_out(df, @formula(Sales + Price ~ NDI + fe(State)), add_mean = true)[1]), dims = 1[1]) ≈ [123.951 68.6999] atol = 1e-3 -@test mean(Matrix(partial_out(df, @formula(Sales + Price ~ 1 + fe(State)), add_mean = true)[1]), dims = 1[1]) ≈ [123.951 68.6999] atol = 1e-3 -@test mean(Matrix(partial_out(df, @formula(Sales + Price ~ 1), add_mean = true)[1]), dims = 1[1]) ≈ [123.951 68.6999] atol = 1e-3 -@test Matrix(partial_out(df, @formula(Sales + Price ~ NDI), weights = :Pop)[1])[1:5, :] ≈[ -37.5296 11.8467; -35.8224 11.9922; -32.5151 10.9377; -34.4416 11.7546; -35.163 10.9459] atol = 1e-3 + @test Matrix(partial_out(df, @formula(Sales + Price ~ NDI))[1])[1:5, :] ≈ [ -37.2108 9.72654; -35.5599 9.87628; -32.309 8.82602; -34.2826 9.64653; -35.0526 8.84143] atol = 1e-3 + @test Matrix(partial_out(df, @formula(Sales + Price ~ NDI + fe(State)))[1])[1:5, :] ≈ [ -21.3715 -0.642783; -19.6571 -0.540335; -16.3427 -1.63789; -18.2631 -0.856975; -18.9784 
-1.70283] atol = 1e-3 + @test Matrix(partial_out(df, @formula(Sales + Price ~ 1 + fe(State)))[1])[1:5, :] ≈ [-13.5767 -40.5467; -12.0767 -39.3467; -8.97667 -39.3467; -11.0767 -37.6467; -11.9767 -37.5467] atol = 1e-3 + @test Matrix(partial_out(df, @formula(Sales + Price ~ 1))[1])[1:5, :] ≈ [ -30.0509 -40.0999; -28.5509 -38.8999; -25.4509 -38.8999; -27.5509 -37.1999; -28.4509 -37.0999] atol = 1e-3 + @test mean(Matrix(partial_out(df, @formula(Sales + Price ~ NDI), add_mean = true)[1]), dims = 1[1]) ≈ [123.951 68.6999] atol = 1e-3 + @test mean(Matrix(partial_out(df, @formula(Sales + Price ~ NDI + fe(State)), add_mean = true)[1]), dims = 1[1]) ≈ [123.951 68.6999] atol = 1e-3 + @test mean(Matrix(partial_out(df, @formula(Sales + Price ~ 1 + fe(State)), add_mean = true)[1]), dims = 1[1]) ≈ [123.951 68.6999] atol = 1e-3 + @test mean(Matrix(partial_out(df, @formula(Sales + Price ~ 1), add_mean = true)[1]), dims = 1[1]) ≈ [123.951 68.6999] atol = 1e-3 + @test Matrix(partial_out(df, @formula(Sales + Price ~ NDI), weights = :Pop)[1])[1:5, :] ≈[ -37.5296 11.8467; -35.8224 11.9922; -32.5151 10.9377; -34.4416 11.7546; -35.163 10.9459] atol = 1e-3 -@test Matrix(partial_out(df, @formula(Sales + Price ~ NDI), weights = :Pop, add_mean = true)[1])[1:5, :] ≈ [82.7448 85.3569; 84.4521 85.5024; 87.7593 84.4479; 85.8329 85.2649; 85.1115 84.4561] atol = 1e-3 + @test Matrix(partial_out(df, @formula(Sales + Price ~ NDI), weights = :Pop, add_mean = true)[1])[1:5, :] ≈ [82.7448 85.3569; 84.4521 85.5024; 87.7593 84.4479; 85.8329 85.2649; 85.1115 84.4561] atol = 1e-3 -@test Matrix(partial_out(df, @formula(Sales + Price ~ NDI + fe(State)), weights = :Pop)[1])[1:5, :] ≈ [ -22.2429 -1.2635 ; -20.5296 -1.1515 ; -17.2164 -2.23949; -19.1378 -1.45057; -19.854 -2.28819] atol = 1e-3 -@test Matrix(partial_out(df, @formula(Sales + Price ~ 1 + fe(State)), weights = :Pop)[1])[1:5, :] ≈ [ -14.0383 -43.1224; -12.5383 -41.9224; -9.43825 -41.9224; -11.5383 -40.2224; -12.4383 -40.1224] atol = 1e-3 -@test Matrix(partial_out(df, @formula(Sales + Price ~ 1), weights = :Pop)[1])[1:5, :] ≈ [ -26.3745 -44.9103; -24.8745 -43.7103; -21.7745 -43.7103; -23.8745 -42.0103; -24.7745 -41.9103] atol = 1e-3 - + @test Matrix(partial_out(df, @formula(Sales + Price ~ NDI + fe(State)), weights = :Pop)[1])[1:5, :] ≈ [ -22.2429 -1.2635 ; -20.5296 -1.1515 ; -17.2164 -2.23949; -19.1378 -1.45057; -19.854 -2.28819] atol = 1e-3 + @test Matrix(partial_out(df, @formula(Sales + Price ~ 1 + fe(State)), weights = :Pop)[1])[1:5, :] ≈ [ -14.0383 -43.1224; -12.5383 -41.9224; -9.43825 -41.9224; -11.5383 -40.2224; -12.4383 -40.1224] atol = 1e-3 + @test Matrix(partial_out(df, @formula(Sales + Price ~ 1), weights = :Pop)[1])[1:5, :] ≈ [ -26.3745 -44.9103; -24.8745 -43.7103; -21.7745 -43.7103; -23.8745 -42.0103; -24.7745 -41.9103] atol = 1e-3 +end + diff --git a/test/predict.jl b/test/predict.jl index 1989b960..6d600e5f 100644 --- a/test/predict.jl +++ b/test/predict.jl @@ -1,312 +1,315 @@ using FixedEffectModels, DataFrames, CategoricalArrays, CSV, Test -df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) -df.StateC = categorical(df.State) -############################################################################## -## -## Printing Results -## -############################################################################## +@testset "print results" begin -model = @formula Sales ~ NDI -result = reg(df, model) -show(result) -predict(result, df) -residuals(result, df) -@test responsename(result) == "Sales" + df = 
DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + df.StateC = categorical(df.State) + model = @formula Sales ~ NDI + result = reg(df, model) + show(result) + predict(result, df) + residuals(result, df) + @test responsename(result) == "Sales" -model = @formula Sales ~ CPI + (Price ~ Pimin) -result = reg(df, model) -coeftable(result) -show(result) -predict(result, df) -residuals(result, df) -@test nobs(result) == 1380 -@test vcov(result)[1] ≈ 3.5384578251636785 - -# predict with interactions -model = @formula Sales ~ CPI * Pop -result = reg(df, model) -@test predict(result, df)[1] ≈ 131.92991 + model = @formula Sales ~ CPI + (Price ~ Pimin) + result = reg(df, model) + coeftable(result) + show(result) + predict(result, df) + residuals(result, df) + @test nobs(result) == 1380 + @test vcov(result)[1] ≈ 3.5384578251636785 -model = @formula Sales ~ Price + fe(State) -result = reg(df, model) -show(result) -model = @formula Sales ~ CPI + (Price ~ Pimin) + fe(State) -result = reg(df, model) -show(result) - - -model = @formula Sales ~ Price + StateC -result = reg(df, model) -@test predict(result, df)[1] ≈ 115.9849874 - -#model = @formula Sales ~ Price + fe(State) -#result = reg(df, model, save = :fe) -#@test predict(result)[1] ≈ 115.9849874 - -model = @formula Sales ~ Price * Pop + StateC -result = reg(df, model) -@test predict(result, df)[1] ≈ 115.643985352 - -#model = @formula Sales ~ Price * Pop + fe(State) -#result = reg(df, model, save = :fe) -#@test predict(result, df)[1] ≈ 115.643985352 - -model = @formula Sales ~ Price + Pop + Price & Pop + StateC -result = reg(df, model) -@test predict(result, df)[1] ≈ 115.643985352 - -#model = @formula Sales ~ Price + Pop + Price & Pop + fe(State) -#result = reg(df, model, save = :fe) -#@test predict(result, df)[1] ≈ 115.643985352 - - - - - -# Tests for predict method -# Test that predicting from model without saved FE test throws -model = @formula Sales ~ Price + fe(State) -result = reg(df, model) -@test_throws "No estimates for fixed effects found. Fixed effects need to be estimated using the option save = :fe or :all for prediction to work." 
predict(result, df) - -# Test basic functionality - adding 1 to price should increase prediction by coef -#model = @formula Sales ~ Price + fe(State) -#result = reg(df, model, save = :fe) -#x = predict(result, DataFrame(Price = [1.0, 2.0], State = [1, 1])) -#@test last(x) - first(x) ≈ only(result.coef) - -# Missing variables in covariates should yield missing prediction -#x = predict(result, DataFrame(Price = [1.0, missing], State = [1, 1])) -#@test ismissing(last(x)) - -# Missing variables in fixed effects should yield missing prediction -#x = predict(result, DataFrame(Price = [1.0, 2.0], State = [1, missing])) -#@test ismissing(last(x)) - -# Fixed effect levels not in the estimation data should yield missing prediction -#x = predict(result, DataFrame(Price = [1.0, 2.0], State = [1, 111])) -#@test ismissing(last(x)) - -############################################################################## -## -## Saved Residuals -## -############################################################################## - -model = @formula Sales ~ Price -result = reg(df, model) -@test residuals(result, df)[1:10] ≈ [-39.2637, -37.48801, -34.38801, -36.09743, -36.97446, -43.15547, -41.22573, -40.83648, -34.52427, -28.91617] atol = 1e-4 -@test r2(result) ≈ 0.0968815737054879 atol = 1e-4 -@test adjr2(result) ≈ 0.0962261902321246 atol = 1e-4 -@test result.nobs == 1380 -@test result.F ≈ 147.8242550248069 atol= 1e-4 - -#weights -model = @formula Sales ~ CPI -result = reg(df, model, weights = :Pop) -@test residuals(result, df)[1:3] ≈ [ -35.641449, -34.0611538, -30.860784] atol = 1e-4 - -# iv -model = @formula Sales ~ CPI + (Price ~ Pimin) -result = reg(df, model) -@test residuals(result, df)[1:3] ≈ [ -33.047390, -30.9518422, -28.1371048] atol = 1e-4 - -# iv with exo after endo -model = @formula Sales ~ (Price ~ Pimin) + CPI -result = reg(df, model) -@test residuals(result, df)[1:3] ≈ [ -33.047390, -30.9518422, -28.1371048] atol = 1e-4 - -# iv and weights -model = @formula Sales ~ CPI + (Price ~ Pimin) -result = reg(df, model, weights = :Pop) -@test residuals(result, df)[1:3] ≈ [ -30.2284549, -28.09507, -25.313248] atol = 1e-4 - -# iv, weights and subset of states -model = @formula Sales ~ CPI + (Price ~ Pimin) -result = reg(df, model, subset = df.State .<= 30, weights = :Pop) -@test residuals(result, df)[1:3] ≈ [ -34.081720, -31.906020, -29.131738] atol = 1e-4 - - -# fixed effects -model = @formula Sales ~ Price + fe(State) -result = reg(df, model, save = true) -@test residuals(result)[1:3] ≈ [-22.08499, -20.33318, -17.23318] atol = 1e-4 -@test result.nobs == 1380 -@test r2(result) ≈ 0.7682403747044817 atol = 1e-4 -@test adjr2(result) ≈ 0.7602426682051615 atol = 1e-4 -@test result.F ≈ 458.4582526109375 atol = 1e-4 - -# fixed effects and weights -model = @formula Sales ~ Price + fe(State) -result = reg(df, model, weights = :Pop, save = true) -@test residuals(result)[1:3] ≈ [ -23.413793, -21.65289, -18.55289] atol = 1e-4 - -# fixed effects and iv -#TO CHECK WITH IVREGHDFE, NO SUPPORT RIGHT NOW -model = @formula Sales ~ CPI + (Price ~ Pimin) + fe(State) -result = reg(df, model, save = true) -@test residuals(result)[1:3] ≈ [ -16.925748, -14.835710, -12.017037] atol = 1e-4 - -#r2 with weights when saving residuals -m = @formula Sales ~ Price -result = reg(df, save = :residuals, weights = :Pop, m) -@test r2(result) ≈ 0.24654260 atol = 1e-4 - - -# test different arguments for the keyword argument save -model = @formula Sales ~ Price + fe(State) -result = reg(df, model, save = true) -@test residuals(result) !== nothing 
-@test "fe_State" ∈ names(fe(result)) - -model = @formula Sales ~ Price + fe(State) -result = reg(df, model, save = :residuals) -@test residuals(result) !== nothing -@test "fe_State" ∉ names(fe(result)) - -model = @formula Sales ~ Price + fe(State) -result = reg(df, model, save = :fe) -@test "fe_State" ∈ names(fe(result)) - - -# iv recategorized -df.Pimin2 = df.Pimin -m = @formula Sales ~ (Pimin2 + Price ~ NDI + Pimin) -result = reg(df, m) -yhat = predict(result, df) -res = residuals(result, df) - -m2 = @formula Sales ~ Pimin2 + (Price ~ NDI + Pimin) -result2 = reg(df, m2) -yhat2 = predict(result2, df) -res2 = residuals(result2, df) -@test yhat ≈ yhat2 -@test res ≈ res2 - -m3 = @formula Sales ~ Pimin2 + (Price ~ NDI) -result3 = reg(df, m3) -yhat3 = predict(result3, df) -res3 = residuals(result3, df) -@test yhat ≈ yhat3 -@test res ≈ res3 - -m4 = @formula Sales ~ (Price + Pimin2 ~ NDI + Pimin) -result4 = reg(df, m4) -yhat4 = predict(result4, df) -res4 = residuals(result4, df) -@test yhat ≈ yhat4 -@test res ≈ res4 - -m5 = @formula Sales ~ (Price ~ NDI + Pimin) + Pimin2 -result5 = reg(df, m5) -yhat5 = predict(result5, df) -res5 = residuals(result5, df) -@test yhat ≈ yhat5 -@test res ≈ res5 - - - -############################################################################## -## -## Saved FixedEffects -## -############################################################################## -# check save does not change r2 -model1 = @formula Sales ~ Price -result1 = reg(df, model1, weights = :Pop) -model2 = @formula Sales ~ Price -result2 = reg(df, model2, weights = :Pop) -@test r2(result1) ≈ r2(result2) - - - -model = @formula Sales ~ Price + fe(Year) -result = reg(df, model, save = true) -@test fe(result)[1, :fe_Year] ≈ 164.77833189721005 -@test size(fe(result), 2) == 1 -@test size(fe(result, keepkeys = true), 2) == 2 - -model = @formula Sales ~ Price + fe(Year) + fe(State) -result = reg(df, model, save = true) -@test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State] ≈ 140.6852 atol = 1e-3 - -model = @formula Sales ~ Price + Year&fe(State) -result = reg(df, model, save = true) -@test fe(result)[1, Symbol("fe_State&Year")] ≈ 1.742779 atol = 1e-3 - -model = @formula Sales ~ Price + fe(State) + Year&fe(State) -result = reg(df, model, save = true) -@test fe(result)[1, :fe_State] ≈ -91.690635 atol = 1e-1 - -model = @formula Sales ~ Price + fe(State) -result = reg(df, model, subset = df.State .<= 30, save = true) -@test fe(result)[1, :fe_State] ≈ 124.913976 atol = 1e-1 -@test ismissing(fe(result)[1380 , :fe_State]) - -model = @formula Sales ~ Price + fe(Year) -result = reg(df, model, weights = :Pop, save = true) -@test fe(result)[2, :fe_Year] - fe(result)[1, :fe_Year] ≈ -3.0347149502496222 - -# fixed effects -df.Price2 = df.Price -model = @formula Sales ~ Price + Price2 + fe(Year) -result = reg(df, model, save = true) -@test fe(result)[1, :fe_Year] ≈ 164.77833189721005 - -# iv -model = @formula Sales ~ (State ~ Price) + fe(Year) -result = reg(df, model, save = true) -@test fe(result)[1, :fe_Year] ≈ -167.48093490413623 - -# weights -model = @formula Sales ~ Price + fe(Year) -result = reg(df, model, weights = :Pop, save = true) -@test fe(result)[2, :fe_Year] - fe(result)[1, :fe_Year] ≈ -3.0347149502496222 - -# IV and weights -model = @formula Sales ~ (Price ~ Pimin) + fe(Year) -result = reg(df, model, weights = :Pop, save = true) -@test fe(result)[1, :fe_Year] ≈ 168.24688 atol = 1e-4 - - -# IV, weights and both year and state fixed effects -model = @formula Sales ~ (Price ~ Pimin) + fe(State) + fe(Year) 
-result = reg(df, model, weights = :Pop, save = true) -@test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State]≈ 147.84145 atol = 1e-4 - - -# subset with IV -model = @formula Sales ~ (Price ~ Pimin) + fe(Year) -result = reg(df, model, subset = df.State .<= 30, save = true) -@test fe(result)[1, :fe_Year] ≈ 164.05245824240276 atol = 1e-4 -@test ismissing(fe(result)[811, :fe_Year]) - - -# subset with IV, weights and year fixed effects -model = @formula Sales ~ (Price ~ Pimin) + fe(Year) -result = reg(df, model, subset = df.State .<= 30, weights = :Pop, save = true) -@test fe(result)[1, :fe_Year] ≈ 182.71915 atol = 1e-4 - -# subset with IV, weights and year fixed effects -model = @formula Sales ~ (Price ~ Pimin) + fe(State) + fe(Year) -result = reg(df, model, subset = df.State .<= 30, weights = :Pop, save = true) -@test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State] ≈ 158.91798 atol = 1e-4 - -methods_vec = [:cpu] -if FixedEffectModels.FixedEffects.has_CUDA() - push!(methods_vec, :gpu) + # predict with interactions + model = @formula Sales ~ CPI * Pop + result = reg(df, model) + @test predict(result, df)[1] ≈ 131.92991 + + + model = @formula Sales ~ Price + fe(State) + result = reg(df, model) + show(result) + model = @formula Sales ~ CPI + (Price ~ Pimin) + fe(State) + result = reg(df, model) + show(result) end -for method in methods_vec - local model = @formula Sales ~ Price + fe(Year) - local result = reg(df, model, save = true, method = method, double_precision = false) - @test fe(result)[1, :fe_Year] ≈ 164.7 atol = 1e-1 + +@testset "predict" begin + + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + df.StateC = categorical(df.State) + + model = @formula Sales ~ Price + StateC + result = reg(df, model) + @test predict(result, df)[1] ≈ 115.9849874 + + #model = @formula Sales ~ Price + fe(State) + #result = reg(df, model, save = :fe) + #@test predict(result)[1] ≈ 115.9849874 + + model = @formula Sales ~ Price * Pop + StateC + result = reg(df, model) + @test predict(result, df)[1] ≈ 115.643985352 + + #model = @formula Sales ~ Price * Pop + fe(State) + #result = reg(df, model, save = :fe) + #@test predict(result, df)[1] ≈ 115.643985352 + + model = @formula Sales ~ Price + Pop + Price & Pop + StateC + result = reg(df, model) + @test predict(result, df)[1] ≈ 115.643985352 + + #model = @formula Sales ~ Price + Pop + Price & Pop + fe(State) + #result = reg(df, model, save = :fe) + #@test predict(result, df)[1] ≈ 115.643985352 + + + + + + # Tests for predict method + # Test that predicting from model without saved FE test throws + model = @formula Sales ~ Price + fe(State) + result = reg(df, model) + @test_throws "No estimates for fixed effects found. Fixed effects need to be estimated using the option save = :fe or :all for prediction to work." 
predict(result, df) + + # Test basic functionality - adding 1 to price should increase prediction by coef + #model = @formula Sales ~ Price + fe(State) + #result = reg(df, model, save = :fe) + #x = predict(result, DataFrame(Price = [1.0, 2.0], State = [1, 1])) + #@test last(x) - first(x) ≈ only(result.coef) + + # Missing variables in covariates should yield missing prediction + #x = predict(result, DataFrame(Price = [1.0, missing], State = [1, 1])) + #@test ismissing(last(x)) + + # Missing variables in fixed effects should yield missing prediction + #x = predict(result, DataFrame(Price = [1.0, 2.0], State = [1, missing])) + #@test ismissing(last(x)) + + # Fixed effect levels not in the estimation data should yield missing prediction + #x = predict(result, DataFrame(Price = [1.0, 2.0], State = [1, 111])) + #@test ismissing(last(x)) end +@testset "residuals" begin + + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + df.StateC = categorical(df.State) + + model = @formula Sales ~ Price + result = reg(df, model) + @test residuals(result, df)[1:10] ≈ [-39.2637, -37.48801, -34.38801, -36.09743, -36.97446, -43.15547, -41.22573, -40.83648, -34.52427, -28.91617] atol = 1e-4 + @test r2(result) ≈ 0.0968815737054879 atol = 1e-4 + @test adjr2(result) ≈ 0.0962261902321246 atol = 1e-4 + @test result.nobs == 1380 + @test result.F ≈ 147.8242550248069 atol= 1e-4 + + #weights + model = @formula Sales ~ CPI + result = reg(df, model, weights = :Pop) + @test residuals(result, df)[1:3] ≈ [ -35.641449, -34.0611538, -30.860784] atol = 1e-4 + + # iv + model = @formula Sales ~ CPI + (Price ~ Pimin) + result = reg(df, model) + @test residuals(result, df)[1:3] ≈ [ -33.047390, -30.9518422, -28.1371048] atol = 1e-4 + + # iv with exo after endo + model = @formula Sales ~ (Price ~ Pimin) + CPI + result = reg(df, model) + @test residuals(result, df)[1:3] ≈ [ -33.047390, -30.9518422, -28.1371048] atol = 1e-4 + + # iv and weights + model = @formula Sales ~ CPI + (Price ~ Pimin) + result = reg(df, model, weights = :Pop) + @test residuals(result, df)[1:3] ≈ [ -30.2284549, -28.09507, -25.313248] atol = 1e-4 + + # iv, weights and subset of states + model = @formula Sales ~ CPI + (Price ~ Pimin) + result = reg(df, model, subset = df.State .<= 30, weights = :Pop) + @test residuals(result, df)[1:3] ≈ [ -34.081720, -31.906020, -29.131738] atol = 1e-4 + + + # fixed effects + model = @formula Sales ~ Price + fe(State) + result = reg(df, model, save = true) + @test residuals(result)[1:3] ≈ [-22.08499, -20.33318, -17.23318] atol = 1e-4 + @test result.nobs == 1380 + @test r2(result) ≈ 0.7682403747044817 atol = 1e-4 + @test adjr2(result) ≈ 0.7602426682051615 atol = 1e-4 + @test result.F ≈ 458.4582526109375 atol = 1e-4 + + # fixed effects and weights + model = @formula Sales ~ Price + fe(State) + result = reg(df, model, weights = :Pop, save = true) + @test residuals(result)[1:3] ≈ [ -23.413793, -21.65289, -18.55289] atol = 1e-4 + + # fixed effects and iv + #TO CHECK WITH IVREGHDFE, NO SUPPORT RIGHT NOW + model = @formula Sales ~ CPI + (Price ~ Pimin) + fe(State) + result = reg(df, model, save = true) + @test residuals(result)[1:3] ≈ [ -16.925748, -14.835710, -12.017037] atol = 1e-4 + + #r2 with weights when saving residuals + m = @formula Sales ~ Price + result = reg(df, save = :residuals, weights = :Pop, m) + @test r2(result) ≈ 0.24654260 atol = 1e-4 + + + # test different arguments for the keyword argument save + model = @formula Sales ~ Price + fe(State) + result = reg(df, model, save = 
true) + @test residuals(result) !== nothing + @test "fe_State" ∈ names(fe(result)) + + model = @formula Sales ~ Price + fe(State) + result = reg(df, model, save = :residuals) + @test residuals(result) !== nothing + @test "fe_State" ∉ names(fe(result)) + + model = @formula Sales ~ Price + fe(State) + result = reg(df, model, save = :fe) + @test "fe_State" ∈ names(fe(result)) + + + # iv recategorized + df.Pimin2 = df.Pimin + m = @formula Sales ~ (Pimin2 + Price ~ NDI + Pimin) + result = reg(df, m) + yhat = predict(result, df) + res = residuals(result, df) + + m2 = @formula Sales ~ Pimin2 + (Price ~ NDI + Pimin) + result2 = reg(df, m2) + yhat2 = predict(result2, df) + res2 = residuals(result2, df) + @test yhat ≈ yhat2 + @test res ≈ res2 + + m3 = @formula Sales ~ Pimin2 + (Price ~ NDI) + result3 = reg(df, m3) + yhat3 = predict(result3, df) + res3 = residuals(result3, df) + @test yhat ≈ yhat3 + @test res ≈ res3 + + m4 = @formula Sales ~ (Price + Pimin2 ~ NDI + Pimin) + result4 = reg(df, m4) + yhat4 = predict(result4, df) + res4 = residuals(result4, df) + @test yhat ≈ yhat4 + @test res ≈ res4 + + m5 = @formula Sales ~ (Price ~ NDI + Pimin) + Pimin2 + result5 = reg(df, m5) + yhat5 = predict(result5, df) + res5 = residuals(result5, df) + @test yhat ≈ yhat5 + @test res ≈ res5 +end + + +@testset "saved fixed effects" begin + + df = DataFrame(CSV.File(joinpath(dirname(pathof(FixedEffectModels)), "../dataset/Cigar.csv"))) + df.StateC = categorical(df.State) + + # check save does not change r2 + model1 = @formula Sales ~ Price + result1 = reg(df, model1, weights = :Pop) + model2 = @formula Sales ~ Price + result2 = reg(df, model2, weights = :Pop) + @test r2(result1) ≈ r2(result2) + + + + model = @formula Sales ~ Price + fe(Year) + result = reg(df, model, save = true) + @test fe(result)[1, :fe_Year] ≈ 164.77833189721005 + @test size(fe(result), 2) == 1 + @test size(fe(result, keepkeys = true), 2) == 2 + + model = @formula Sales ~ Price + fe(Year) + fe(State) + result = reg(df, model, save = true) + @test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State] ≈ 140.6852 atol = 1e-3 + + model = @formula Sales ~ Price + Year&fe(State) + result = reg(df, model, save = true) + @test fe(result)[1, Symbol("fe_State&Year")] ≈ 1.742779 atol = 1e-3 + + model = @formula Sales ~ Price + fe(State) + Year&fe(State) + result = reg(df, model, save = true) + @test fe(result)[1, :fe_State] ≈ -91.690635 atol = 1e-1 + + model = @formula Sales ~ Price + fe(State) + result = reg(df, model, subset = df.State .<= 30, save = true) + @test fe(result)[1, :fe_State] ≈ 124.913976 atol = 1e-1 + @test ismissing(fe(result)[1380 , :fe_State]) + + model = @formula Sales ~ Price + fe(Year) + result = reg(df, model, weights = :Pop, save = true) + @test fe(result)[2, :fe_Year] - fe(result)[1, :fe_Year] ≈ -3.0347149502496222 + + # fixed effects + df.Price2 = df.Price + model = @formula Sales ~ Price + Price2 + fe(Year) + result = reg(df, model, save = true) + @test fe(result)[1, :fe_Year] ≈ 164.77833189721005 + + # iv + model = @formula Sales ~ (State ~ Price) + fe(Year) + result = reg(df, model, save = true) + @test fe(result)[1, :fe_Year] ≈ -167.48093490413623 + + # weights + model = @formula Sales ~ Price + fe(Year) + result = reg(df, model, weights = :Pop, save = true) + @test fe(result)[2, :fe_Year] - fe(result)[1, :fe_Year] ≈ -3.0347149502496222 + + # IV and weights + model = @formula Sales ~ (Price ~ Pimin) + fe(Year) + result = reg(df, model, weights = :Pop, save = true) + @test fe(result)[1, :fe_Year] ≈ 168.24688 atol = 1e-4 + + + # IV, 
weights and both year and state fixed effects + model = @formula Sales ~ (Price ~ Pimin) + fe(State) + fe(Year) + result = reg(df, model, weights = :Pop, save = true) + @test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State]≈ 147.84145 atol = 1e-4 + + + # subset with IV + model = @formula Sales ~ (Price ~ Pimin) + fe(Year) + result = reg(df, model, subset = df.State .<= 30, save = true) + @test fe(result)[1, :fe_Year] ≈ 164.05245824240276 atol = 1e-4 + @test ismissing(fe(result)[811, :fe_Year]) + + + # subset with IV, weights and year fixed effects + model = @formula Sales ~ (Price ~ Pimin) + fe(Year) + result = reg(df, model, subset = df.State .<= 30, weights = :Pop, save = true) + @test fe(result)[1, :fe_Year] ≈ 182.71915 atol = 1e-4 + + # subset with IV, weights and year fixed effects + model = @formula Sales ~ (Price ~ Pimin) + fe(State) + fe(Year) + result = reg(df, model, subset = df.State .<= 30, weights = :Pop, save = true) + @test fe(result)[1, :fe_Year] + fe(result)[1, :fe_State] ≈ 158.91798 atol = 1e-4 + + methods_vec = [:cpu] + if FixedEffectModels.FixedEffects.has_CUDA() + push!(methods_vec, :gpu) + end + for method in methods_vec + local model = @formula Sales ~ Price + fe(Year) + local result = reg(df, model, save = true, method = method, double_precision = false) + @test fe(result)[1, :fe_Year] ≈ 164.7 atol = 1e-1 + end +end + diff --git a/test/runtests.jl b/test/runtests.jl index cf458e85..eebbe167 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,27 +1,7 @@ -using FixedEffectModels +using FixedEffectModels, Test -tests = [ - "formula.jl", - "fit.jl", - "predict.jl", - "partial_out.jl", - "collinearity.jl" - ] - -println("Running tests:") - - -for test in tests - try - include(test) - println("\t\033[1m\033[32mPASSED\033[0m: $(test)") - catch e - println("\t\033[1m\033[31mFAILED\033[0m: $(test)") - showerror(stdout, e, backtrace()) - rethrow(e) - end -end - - -using Test -@test Vcov.pinvertible(Symmetric([1.0 1.0; 1.0 1.0])) ≈ [1.0 1.0; 1.0 1.0] \ No newline at end of file +@testset "formula" begin include("formula.jl") end +@testset "fit" begin include("fit.jl") end +@testset "predict" begin include("predict.jl") end +@testset "partial out" begin include("partial_out.jl") end +@testset "collinearity" begin include("collinearity.jl") end
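
The runtests.jl change at the end of this diff replaces the old include-and-print loop with one named `@testset` per test file. As a minimal usage sketch (not part of the diff itself; it assumes the package and its test-only dependencies DataFrames, CategoricalArrays, CSV, and Test are available in the active environment), the reorganized suite can be run in full or file by file:

```julia
# Run the full suite: Pkg.test executes test/runtests.jl, so each file now
# reports as its own named @testset (formula, fit, predict, partial out, collinearity).
using Pkg
Pkg.test("FixedEffectModels")

# Because every file is now wrapped in a @testset and loads its own dataset,
# a single file can also be included directly for quicker iteration:
using FixedEffectModels, Test
include(joinpath(pkgdir(FixedEffectModels), "test", "predict.jl"))
```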