Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/poisson regression #73

Merged
merged 3 commits into from
Jan 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
package whu.edu.cn.algorithms.SpatialStats.SpatialRegression

import breeze.linalg.{DenseMatrix, DenseVector}
import breeze.linalg.{DenseMatrix, DenseVector, max}
import breeze.linalg._
import breeze.numerics._
import breeze.stats.distributions.Poisson
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.locationtech.jts.geom.Geometry
import whu.edu.cn.algorithms.SpatialStats.GWModels.Algorithm
import whu.edu.cn.algorithms.SpatialStats.SpatialRegression.LogisticRegression.setX
import whu.edu.cn.oge.Service

import scala.collection.mutable

Expand Down Expand Up @@ -38,9 +45,206 @@ object PoissonRegression extends Algorithm {
_dvecY = DenseVector(_data.map(t => t(property).asInstanceOf[String].toDouble).collect())
}

def poissonRegression(x: Array[DenseVector[Double]], y: DenseVector[Double]): Unit = {
/**
 * Fit a Poisson regression (GLM with log link) via IRLS (iteratively
 * reweighted least squares), mirroring R's glm.fit initialization
 * (mustart = y + 0.1), then print a formatted summary via Service.print.
 *
 * @param sc        SparkContext (not used directly here; kept for API consistency)
 * @param data      feature RDD; each record's attribute map supplies the variables
 * @param y         name of the response property (non-negative counts expected)
 * @param x         comma-separated names of the predictor properties
 * @param Intercept whether to include an intercept column (default true)
 * @param maxIter   maximum number of IRLS iterations (default 100)
 * @param epsilon   relative deviance-change convergence tolerance (default 1e-8)
 * @throws IllegalArgumentException if the response contains negative values
 */
def fit(sc: SparkContext, data: RDD[(String, (Geometry, mutable.Map[String, Any]))],
        y: String, x: String, Intercept: Boolean = true,
        maxIter: Int = 100, epsilon: Double = 1e-8): Unit = {
  _data = data.map(t => t._2._2)
  setX(x)
  setY(y)
  // design matrix with or without the leading intercept column
  val X = if (Intercept) _dmatX else _rawdX
  val Y = _dvecY

  // validate the response: Poisson family requires non-negative counts
  if (Y.toArray.exists(_ < 0)) {
    throw new IllegalArgumentException("negative values not allowed for the 'Poisson' family")
  }

  // initialization — R's glm uses mustart <- y + 0.1
  var mu = Y + 0.1
  var eta = mu.map(math.log) // linkfun = log
  var beta = DenseVector.zeros[Double](X.cols)
  val weights = DenseVector.ones[Double](Y.length) // prior weights

  var devold = Double.PositiveInfinity
  var dev = calculateDeviance(Y, mu, weights)
  var iter = 0
  var converged = false

  // IRLS main loop: each pass solves one weighted least-squares problem
  while (iter < maxIter && !converged) {
    iter += 1
    devold = dev

    val varmu = mu.copy // variance = mu for Poisson
    val muEta = mu.copy // mu.eta = mu for log link
    // adjusted (working) response
    val z = eta + (Y - mu) / muEta
    // working weights
    val w = muEta
    //val w = multipleByElement(weights, divideByElement(varmu, mu.map(x => math.pow(math.log(x), 2)))) // build weight matrix

    // NOTE(review): dense n-by-n diagonal of sqrt-weights — O(n^2) memory;
    // element-wise scaling of rows would avoid materializing W
    val W = DenseMatrix.zeros[Double](Y.length, Y.length)
    for (i <- 0 until Y.length) {
      W(i, i) = math.sqrt(w(i))
    }

    try {
      val Xw = W * X
      val zw = W * z
      // solve the weighted least squares via reduced QR decomposition
      val qr = breeze.linalg.qr.reduced(Xw)
      beta = breeze.linalg.inv(qr.r) * (qr.q.t * zw)
      //beta = breeze.linalg.backsolve(qr.r, qr.q.t * zw)
      // update the linear predictor
      eta = X * beta
      // update the mean (linkinv = exp)
      mu = eta.map(math.exp)
      // compute the new deviance
      dev = calculateDeviance(Y, mu, weights)
      // convergence test: relative change in deviance
      converged = math.abs((dev - devold) / (dev + 0.1)) < epsilon

    } catch {
      // NOTE(review): any numerical failure stops iteration with the last
      // successful beta; the summary is still printed below
      case e: Exception =>
        println(s"Error in iteration $iter: ${e.getMessage}")
        converged = true
    }
  }

  // fitted values and raw residuals
  //yhat, residual
  val yhat = exp(X * beta)
  val res = (Y - yhat)

  // deviance residuals: sign(y - yhat) * sqrt(unit deviance); the y == 0
  // case reduces to -sqrt(2 * yhat)
  val devRes = DenseVector.zeros[Double](Y.length)
  for (i <- 0 until Y.length) {
    val yi = Y(i)
    val yhat_i = yhat(i)

    if (yi == 0) {
      devRes(i) = -math.sqrt(2.0 * yhat_i)
    } else {
      val dev = 2.0 * (yi * math.log(yi / yhat_i) - (yi - yhat_i))
      devRes(i) = math.signum(yi - yhat_i) * math.sqrt(dev)
    }
  }

  // assemble the printed report
  var str = "\n********************Results of Poisson Regression********************\n"

  var formula = f"${y} ~ "
  val X_max = if (Intercept) X.cols else X.cols + 1
  for (i <- 1 until X_max) {
    if (i == 1) {
      formula += f"${_nameX(i - 1)} "
    } else {
      formula += f"+ ${_nameX(i - 1)} "
    }
  }
  str += "Formula:\n" + formula + f"\n"

  str += "\n"
  str += f"Deviance Residuals: \n" +
    f"min: ${devRes.toArray.min.formatted("%.4f")} " +
    f"max: ${devRes.toArray.max.formatted("%.4f")} " +
    f"mean: ${breeze.stats.mean(devRes).formatted("%.4f")} " +
    f"median: ${breeze.stats.median(devRes).formatted("%.4f")}\n"

  str += "\n"
  str += "Coefficients:\n"
  if (Intercept) {
    str += f"Intercept:${beta(0).formatted("%.6f")}\n"
    for (i <- 1 until (X.cols)) {
      str += f"${_nameX(i - 1)}: ${beta(i).formatted("%.6f")}\n"
    }
  } else {
    for (i <- 0 until (X.cols)) {
      str += f"${_nameX(i)}: ${beta(i).formatted("%.6f")}\n"
    }
  } // need to fix at linear and logistic

  str += diagnostic(X, Y, devRes, _df, mu, weights, Intercept)

  str += "\n"
  str += f"Number of Iterations: ${iter}\n"

  str += "**********************************************************************\n"

  Service.print(str, "Poisson Regression for feature", "String")
}

/**
 * Build the diagnostics section of the report: null deviance, residual
 * deviance, their degrees of freedom, and the AIC.
 *
 * @param X         design matrix used in the fit
 * @param Y         response vector (non-negative counts)
 * @param devRes    deviance residuals of the fitted model
 * @param df        number of predictors (model df, excluding the intercept)
 * @param mu        fitted means
 * @param weights   prior weights
 * @param Intercept whether an intercept was fitted
 * @return formatted diagnostics text appended to the report
 */
protected def diagnostic(X: DenseMatrix[Double], Y: DenseVector[Double], devRes: DenseVector[Double], df: Double,
                         mu: DenseVector[Double], weights: DenseVector[Double], Intercept: Boolean): String = {

  val n = X.rows.toDouble
  val p = df

  // Null deviance: deviance of the intercept-only model (mu = mean(Y)) when an
  // intercept is fitted, otherwise of the empty model (eta = 0, i.e. mu = 1).
  // Guard yi == 0 explicitly: lim_{y->0} y*log(y) = 0, whereas evaluating
  // math.log on 0 would propagate NaN through the sum.
  val y_mean = breeze.stats.mean(Y)
  val null_deviance = if (Intercept) {
    // unit deviance d_i = 2*(yi*log(yi/mu) - (yi - mu)); for yi == 0 it is 2*mu
    Y.toArray.map { yi =>
      if (yi > 0) 2.0 * (yi * math.log(yi / y_mean) - (yi - y_mean))
      else 2.0 * y_mean
    }.sum
  } else {
    // with mu = 1: d_i = 2*(yi*log(yi) - yi + 1); the constant +2 per
    // observation is accumulated as the trailing 2*n term
    Y.toArray.map(yi => if (yi > 0) 2.0 * (yi * math.log(yi) - yi) else 0.0).sum + 2 * n
  }

  // residual deviance = sum of squared deviance residuals
  val residual_deviance = sum(devRes.map(x => x * x))

  // AIC
  val aic = calculateAIC(Y.toArray, n.toInt, mu.toArray, weights.toArray, p, Intercept)

  // degrees of freedom
  val null_df = if (Intercept) n - 1 else n
  val residual_df = if (Intercept) n - p - 1 else n - p

  "\nDiagnostics:\n" +
    f"Null deviance: $null_deviance%.2f on $null_df%.0f degrees of freedom\n" +
    f"Residual deviance: $residual_deviance%.2f on $residual_df%.0f degrees of freedom\n" +
    f"AIC: $aic%.2f\n"
  //f"Pseudo R-squared: $nagelkerke_r2%.4f\n"
}

/**
 * Weighted Poisson deviance: 2 * sum_i w_i * (y_i*log(y_i/mu_i) - (y_i - mu_i)),
 * where the y_i == 0 term reduces to w_i * mu_i.
 */
private def calculateDeviance(y: DenseVector[Double], mu: DenseVector[Double],
                              weights: DenseVector[Double]): Double = {
  // accumulate per-observation contributions in index order, then double
  val half = (0 until y.length).map { idx =>
    val yi = y(idx)
    if (yi > 0) weights(idx) * (yi * math.log(yi / mu(idx)) - (yi - mu(idx)))
    else weights(idx) * mu(idx)
  }.sum
  2.0 * half
}

/** Element-wise product of two vectors of equal length. */
private def multipleByElement(a: DenseVector[Double], b: DenseVector[Double]): DenseVector[Double] =
  DenseVector.tabulate(a.length)(k => a(k) * b(k))

/** Element-wise quotient a(i) / b(i) of two vectors of equal length. */
private def divideByElement(a: DenseVector[Double], b: DenseVector[Double]): DenseVector[Double] =
  DenseVector.tabulate(a.length)(k => a(k) / b(k))

/**
 * AIC of the fitted Poisson model: -2 * logLik + 2 * k, where k is the number
 * of estimated parameters (p predictors plus intercept when fitted).
 *
 * Zero counts now contribute their true log-likelihood log P(0 | mu) = -mu;
 * previously they contributed 0, understating -2*logLik whenever the response
 * contained zeros.
 *
 * @param y         observed counts
 * @param n         number of observations (unused; kept for API compatibility)
 * @param mu        fitted means
 * @param wt        prior weights
 * @param p         number of predictors (excluding the intercept)
 * @param Intercept whether an intercept was fitted
 */
def calculateAIC(y: Array[Double], n: Int, mu: Array[Double], wt: Array[Double], p: Double, Intercept: Boolean): Double = {
  val logLik = y.zip(mu).zip(wt).map { case ((yValue, muValue), weight) =>
    val ll =
      if (yValue > 0) Poisson(muValue).logProbabilityOf(yValue.toInt)
      else -muValue // Poisson log pmf at 0 is exactly -mu
    ll * weight
  }.sum
  val aic = -2 * logLik + 2 * (p + 1)
  // without an intercept the parameter count is p, not p + 1
  if (Intercept) aic else aic - 2
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,10 @@ import breeze.linalg.{DenseMatrix, DenseVector, norm, normalize}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.locationtech.jts.geom.{Coordinate, Point}
import whu.edu.cn.algorithms.SpatialStats.BasicStatistics.{AverageNearestNeighbor, DescriptiveStatistics, RipleysK, PrincipalComponentAnalysis}
import whu.edu.cn.algorithms.SpatialStats.BasicStatistics.{AverageNearestNeighbor, DescriptiveStatistics, PrincipalComponentAnalysis, RipleysK}
import whu.edu.cn.algorithms.SpatialStats.STCorrelations.CorrelationAnalysis.corrMat
import whu.edu.cn.algorithms.SpatialStats.STCorrelations.TemporalAutoCorrelation._
import whu.edu.cn.algorithms.SpatialStats.SpatialRegression.LinearRegression
import whu.edu.cn.algorithms.SpatialStats.SpatialRegression.LogisticRegression
import whu.edu.cn.algorithms.SpatialStats.SpatialRegression.{SpatialDurbinModel, SpatialErrorModel, SpatialLagModel}
import whu.edu.cn.algorithms.SpatialStats.SpatialRegression.{LinearRegression, LogisticRegression, PoissonRegression, SpatialDurbinModel, SpatialErrorModel, SpatialLagModel}
import whu.edu.cn.algorithms.SpatialStats.Utils.FeatureDistance._
import whu.edu.cn.algorithms.SpatialStats.Utils.OtherUtils._
import whu.edu.cn.algorithms.SpatialStats.GWModels.GWRbasic
Expand Down Expand Up @@ -70,8 +68,9 @@ object test {
// interpolationUtils.makeTiff(ras,"src/main/scala/whu/edu/cn/algorithms/SpatialStats/Test/testdata/","kriging")
// selfDefinedKriging(sc,shpfile2,"z",10,10,"Sph",0.1,0.1,0.1)

LinearRegression.fit(sc, shpfile3,y="PURCHASE", x="FLOORSZ,PROF,UNEMPLOY")
// LinearRegression.fit(sc, shpfile3,y="PURCHASE", x="FLOORSZ,PROF,UNEMPLOY")
// LogisticRegression.fit(sc, shpfile3,y="TYPEFLAT", x="FLOORSZ,PROF,UNEMPLOY")
PoissonRegression.fit(sc, shpfile3,y="PURCHASE", x="FLOORSZ,PROF,UNEMPLOY",Intercept = true)
// SpatialLagModel.fit(sc, shpfile, "aging", "PCGDP,GI,FD,education")
// SpatialErrorModel.fit(sc, shpfile, "aging", "PCGDP,GI,FD,education")
// SpatialDurbinModel.fit(sc, shpfile, "aging", "PCGDP,GI,FD,education")
Expand Down