diff --git a/.github/ISSUE_TEMPLATE/0-general_issue.md b/.github/ISSUE_TEMPLATE/0-general_issue.md new file mode 100644 index 0000000..84bb0d7 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/0-general_issue.md @@ -0,0 +1,8 @@ +--- +name: General issue +about: Quickly create a general issue +title: '' +labels: '' +assignees: '' + +--- \ No newline at end of file diff --git a/.github/ISSUE_TEMPLATE/1-bug_report.md b/.github/ISSUE_TEMPLATE/1-bug_report.md new file mode 100644 index 0000000..d4917b5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/1-bug_report.md @@ -0,0 +1,17 @@ +--- +name: Bug report +about: Tell us about a problem to fix +title: 'Short description' +labels: 'bug' +assignees: '' + +--- +**Bug report** + + +**Before submitting** +Please check the following: + +- [ ] I have described the situation in which the bug arose, including what code was executed, information about my environment, and any applicable data others will need to reproduce the problem. +- [ ] I have included available evidence of the unexpected behavior (including error messages, screenshots, and/or plots) as well as a description of what I expected instead. +- [ ] If I have a solution in mind, I have provided an explanation and/or pseudocode and/or a task list. diff --git a/.github/ISSUE_TEMPLATE/2-feature_request.md b/.github/ISSUE_TEMPLATE/2-feature_request.md new file mode 100644 index 0000000..50ab544 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/2-feature_request.md @@ -0,0 +1,18 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: 'Short description' +labels: 'enhancement' +assignees: '' + +--- + +**Feature request** + + +**Before submitting** +Please check the following: + +- [ ] I have described the purpose of the suggested change, specifying what I need the enhancement to accomplish, i.e. what problem it solves. +- [ ] I have included any relevant links, screenshots, environment information, and data relevant to implementing the requested feature, as well as pseudocode for how I want to access the new functionality. +- [ ] If I have ideas for how the new feature could be implemented, I have provided explanations and/or pseudocode and/or task lists for the steps. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..87b1e80 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,8 @@ +## Problem & Solution Description (including issue #) + + +## Code Quality +- [ ] My code follows [the code style of this project](https://rail-hub.readthedocs.io/en/latest/source/contributing.html#naming-conventions) +- [ ] I have written unit tests or justified all instances of `#pragma: no cover`; in the case of a bugfix, a new test that breaks as a result of the bug has been added +- [ ] My code contains relevant comments and necessary documentation for future maintainers; the change is reflected in applicable demos/tutorials (with output cleared!)
and added/updated docstrings use the [NumPy docstring format](https://numpydoc.readthedocs.io/en/latest/format.html) +- [ ] Any breaking changes, described above, are accompanied by backwards compatibility and deprecation warnings diff --git a/.github/workflows/add-issue-to-project-tracker.yml b/.github/workflows/add-issue-to-project-tracker.yml new file mode 100644 index 0000000..37daf55 --- /dev/null +++ b/.github/workflows/add-issue-to-project-tracker.yml @@ -0,0 +1,20 @@ +name: Add issues and PRs to project tracker + +on: + issues: + types: + - opened + pull_request: + types: [opened, reopened] + +jobs: + add-to-project: + name: Add issue to project + runs-on: ubuntu-latest + steps: + - uses: actions/add-to-project@v0.4.0 + with: + # You can target a project in a different organization + # than the issue + project-url: https://github.com/orgs/LSSTDESC/projects/6 + github-token: ${{ secrets.ADD_TO_PROJECT_PAT }} diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml new file mode 100644 index 0000000..f7dc29a --- /dev/null +++ b/.github/workflows/linting.yml @@ -0,0 +1,33 @@ +# This workflow will install Python dependencies, then perform static linting analysis. +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Lint + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Install dependencies + run: | + sudo apt-get update + python -m pip install --upgrade pip + pip install . + pip install .[dev] + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Analyze code with linter + run: | + pylint -rn -sn --recursive=y ./src + pylint -rn -sn --recursive=y ./tests + # the following line allows the CI test to pass, even if pylint fails + continue-on-error: true diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml new file mode 100644 index 0000000..a616f50 --- /dev/null +++ b/.github/workflows/publish-to-pypi.yml @@ -0,0 +1,37 @@ +# This workflow will upload a Python Package using Twine when a release is created +# For more information see: https://github.com/pypa/gh-action-pypi-publish#trusted-publishing + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +name: Upload Python Package + +on: + release: + types: [published] + +permissions: + contents: read + +jobs: + deploy: + + runs-on: ubuntu-latest + permissions: + id-token: write + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1 \ No newline at end of file diff --git a/.github/workflows/smoke-test.yml b/.github/workflows/smoke-test.yml new file mode 100644 index 0000000..85b75a6 --- /dev/null +++ b/.github/workflows/smoke-test.yml @@ -0,0 +1,36 @@ +# This workflow will run daily at 06:45. +# It will install Python dependencies and run tests with a variety of Python versions.
+ +name: Unit test smoke test + +on: + schedule: + - cron: '45 6 * * *' + + # Allows you to run this workflow manually from the Actions tab + workflow_dispatch: + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.9', '3.10', '3.11'] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + sudo apt-get update + python -m pip install --upgrade pip + pip install . + pip install .[dev] + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Run unit tests with pytest + run: | + python -m pytest tests diff --git a/.github/workflows/testing-and-coverage.yml b/.github/workflows/testing-and-coverage.yml new file mode 100644 index 0000000..3a44f8f --- /dev/null +++ b/.github/workflows/testing-and-coverage.yml @@ -0,0 +1,39 @@ +# This workflow will install Python dependencies, run tests, and report code coverage with a variety of Python versions +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Unit test and code coverage + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['3.9', '3.10', '3.11'] + + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + sudo apt-get update + python -m pip install --upgrade pip + pip install . + pip install .[dev] + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Run unit tests with pytest + run: | + python -m pytest tests --cov=dnf --cov-report=xml + - name: Upload coverage report to codecov + uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} diff --git a/examples/DNF_Demo.ipynb b/examples/DNF_Demo.ipynb new file mode 100644 index 0000000..7564eaf --- /dev/null +++ b/examples/DNF_Demo.ipynb @@ -0,0 +1,433 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# RAIL's DNF implementation example\n", + "\n", + "authors: Laura Toribio san Cipriano, Sam Schmidt\n", + "last successfully run: Jan 13, 2025\n", + "\n", + "A quick demo of the DNF package in RAIL.\n", + "\n", + "DNF (Directional Neighborhood Fitting, De Vicente et al. 2016) estimates each galaxy's redshift with a local linear fit over its nearest neighbors in the training sample, where the neighbors can be selected with a Euclidean (\"ENF\"), angular (\"ANF\"), or directional (\"DNF\") metric; redshift PDFs are constructed from the weighted redshift distribution of those neighbors." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "#%matplotlib inline " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import rail\n", + "import qp\n", + "from rail.core.data import TableHandle\n", + "from rail.core.stage import RailStage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "DS = RailStage.data_store\n", + "DS.__class__.allow_overwrite = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training the informer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dnf_dict = dict(zmin=0.0, zmax=3.0, nzbins=301, hdf5_groupname='photometry')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, +
"source": [ + "We will begin by training the algorithm, to to this we instantiate a rail object with a call to the base class.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from rail.estimation.algos.dnf import DNFInformer, DNFEstimator\n", + "pz_train = DNFInformer.make_stage(name='inform_DNF', model='demo_DNF_model.pkl', **dnf_dict)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's load our training data, which is stored in hdf5 format. We'll load it into the Data Store so that the ceci stages are able to access it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from rail.utils.path_utils import RAILDIR\n", + "trainFile = os.path.join(RAILDIR, 'rail/examples_data/testdata/test_dc2_training_9816.hdf5')\n", + "testFile = os.path.join(RAILDIR, 'rail/examples_data/testdata/test_dc2_validation_9816.hdf5')\n", + "training_data = DS.read_file(\"training_data\", TableHandle, trainFile)\n", + "test_data = DS.read_file(\"test_data\", TableHandle, testFile)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The inform stage for CMNN should not take long to run, it essentially just converts the magnitudes to colors for the training data and stores those as a model dictionary which is stored in a pickle file specfied by the `model` keyword above, in this case \"demo_cmnn_model.pkl\". This file should appear in the directory after we run the inform stage in the cell below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "pz_train.inform(training_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can now set up the main photo-z stage and run our algorithm on the data to produce simple photo-z estimates. Note that we are loading the trained model that we computed from the inform stage: with the `model=pz_train.get_handle('model')` statement. We will set `nondetect_replace` to `True` to replace our non-detection magnitudes with their 1-sigma limits and use all colors.
\n", + "\n", + "DNF allows three different methods for choosing the distance metric: Euclidean (\"ENF\" set with `selection_mode` of `0`), Angular (\"ANF\" set with selection_mode of `1`, this is the default for the stage), and Directional (\"DNF\" set with `selection_mode` of `2`).\n", + "\n", + "In our first example, let's set the `selection_mode` to \"1\", which will use the angular distance:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "pz = DNFEstimator.make_stage(name='DNF_estimate', hdf5_groupname='photometry',\n", + " model=pz_train.get_handle('model'),\n", + " selection_mode=1,\n", + " min_n=15,\n", + " bad_redshift_val=99.,\n", + " bad_redshift_err=10.,\n", + " nondetect_replace=True)\n", + "results = pz.estimate(test_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DNF calculates its own point estimate, `DNF_Z`, which is stored in the qp Ensemble `ancil` data. Let's plot that versus the true redshift. We can also compute the PDF mode for each object and plot that as well:\n", + "\n", + "# NOTE: we should probably say what the DNF_Z is exactly, is it mean of the PDF?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#zmode = results().ancil['zmode']\n", + "zdnf = results().ancil['DNF_Z'].flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zgrid = np.linspace(0,3,301)\n", + "zmode = results().mode(zgrid).flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zmode" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's plot the redshift mode against the true redshifts to see how they look:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(8,8))\n", + "plt.scatter(test_data()['photometry']['redshift'],zmode,s=1,c='k',label='DNF mode')\n", + "plt.plot([0,3],[0,3],'r--');\n", + "plt.xlabel(\"true redshift\")\n", + "plt.ylabel(\"DNF photo-z mode\")\n", + "plt.ylim(0,3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(8,8))\n", + "plt.scatter(test_data()['photometry']['redshift'], zdnf, s=1, c='k')\n", + "plt.plot([0,3],[0,3], 'r--');\n", + "plt.xlabel(\"true redshift\")\n", + "plt.ylabel(\"DNF_Z\")\n", + "plt.ylim(0,3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## plotting PDFs\n", + "\n", + "In addition to point estimates, we can also plot a few of the full PDFs produced by DNF using the `plot_native` method of the qp Ensemble that we've created as `results`. We can specify which PDF to plot with the `key` argument to `plot_native`, let's plot four, the 5th, 1380th, 14481st, and 18871st:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axs = plt.subplots(2, 2, figsize=(12,8))\n", + "whichgals = [4, 1379, 14480, 18870]\n", + "for ax, which in zip(axs.flat, whichgals):\n", + " ax.set_xlim(0,3)\n", + " results().plot_native(key=which, axes=ax)\n", + " ax.set_xlabel(\"redshift\")\n", + " ax.set_ylabel(\"p(z)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see quite a bit of structure in the DNF PDFs, with lots of discrete peaks. 
\n", + "\n", + "\n", + "## NOTE: we should probably add more about DNF and why the PDFs look this way with all of the little bumps!!!!!\n", + "\n", + "\n", + "# Other distance metrics\n", + "\n", + "Besides \"DNF\" there are options for \"ENF\" and \"ANF\" (I don't know what the differences are, looks to be the metric used, and \"direction\", \"euclidean\", and \"angular\". We would have to read the paper for details), try these out.\n", + "\n", + "Let's run our estimator using `selection_mode=0` for the Euclidean distance, and compare both the mode results and PDF results:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "pz2 = DNFEstimator.make_stage(name='DNF_estimate2', hdf5_groupname='photometry',\n", + " model=pz_train.get_handle('model'),\n", + " selection_mode=0,\n", + " min_n=15,\n", + " bad_redshift_val=99.,\n", + " bad_redshift_err=10.,\n", + " nondetect_replace=True)\n", + "results2 = pz2.estimate(test_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zdnf2 = results2().ancil['DNF_Z'].flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "zgrid = np.linspace(0,3,301)\n", + "zmode2 = results2().mode(zgrid).flatten()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we can plot the DNF and mode plots on their own, they look very similar, but not exactly the same, as the \"angular\" distance estimates:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(8,8))\n", + "plt.scatter(test_data()['photometry']['redshift'],zmode2,s=1,c='k',label='DNF mode')\n", + "plt.plot([0,3],[0,3],'r--');\n", + "plt.xlabel(\"true redshift\")\n", + "plt.ylabel(\"DNF photo-z mode\")\n", + "plt.ylim(0,3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(8,8))\n", + "plt.scatter(test_data()['photometry']['redshift'], zdnf2, s=1, c='k')\n", + "plt.plot([0,3],[0,3], 'r--');\n", + "plt.xlabel(\"true redshift\")\n", + "plt.ylabel(\"DNF_Z\")\n", + "plt.ylim(0,3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's directly compare the \"angular\" and \"Euclidean\" distance estimates on the same axes:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(8,8))\n", + "plt.scatter(test_data()['photometry']['redshift'], zdnf, s=2, c='k', label=\"angular\")\n", + "plt.scatter(test_data()['photometry']['redshift'], zdnf2, s=1, c='r', label=\"Euclidean\")\n", + "plt.legend(loc='upper left', fontsize=10)\n", + "plt.plot([0,3],[0,3], 'm--');\n", + "plt.xlabel(\"true redshift\")\n", + "plt.ylabel(\"DNF_Z\")\n", + "plt.ylim(0,3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(8,8))\n", + "plt.scatter(test_data()['photometry']['redshift'], zmode, s=2, c='k')\n", + "plt.scatter(test_data()['photometry']['redshift'], zmode2, s=1, c='r')\n", + "plt.plot([0,3],[0,3], 'm--');\n", + "plt.xlabel(\"true redshift\")\n", + "plt.ylabel(\"DNF_Z\")\n", + "plt.ylim(0,3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, let's directly compare the same PDFs that we plotted above" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axs = plt.subplots(2, 2, figsize=(12,8))\n", + "whichgals = [4, 1379, 14480, 18870]\n", + "for ax, which in zip(axs.flat, whichgals):\n", + " ax.set_xlim(0,3)\n", + " results().plot_native(key=which, axes=ax, label=\"angular\")\n", + " results2().plot_native(key=which, axes=ax, label=\"Euclidean\")\n", + " ax.set_xlabel(\"redshift\")\n", + " ax.set_ylabel(\"p(z)\")\n", + "ax.legend(loc='upper left', fontsize=12)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar to the mode and DNF point estimates, we see that the PDFs look similar, but not exactly the same." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.16" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml index 3ce3b6d..67efc32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,11 @@ classifiers = [ ] dynamic = ["version"] -dependencies = [] +dependencies = [ + "pz-rail-base>1.0.3", + "qp-prob[full]", + "scikit-learn", + ] # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes) [project.optional-dependencies] @@ -35,7 +39,7 @@ requires = [ build-backend = "setuptools.build_meta" [tool.setuptools_scm] -write_to = "src/rail/example_package/_version.py" +write_to = "src/rail/dnf/_version.py" [tool.pytest.ini_options] testpaths = [ diff --git a/src/rail/estimation/algos/dnf.py b/src/rail/estimation/algos/dnf.py index c0a264c..259f903 100644 --- a/src/rail/estimation/algos/dnf.py +++ b/src/rail/estimation/algos/dnf.py @@ -4,15 +4,15 @@ for more details """ -import math +# import math import numpy as np import qp from sklearn import neighbors -from scipy.stats import chi2 +# from scipy.stats import chi2 from ceci.config import StageParameter as Param from rail.estimation.estimator import CatEstimator, CatInformer from rail.core.common_params import SHARED_PARAMS -from sklearn import neighbors + def _computemagdata(data, column_names, err_names): """ @@ -22,41 +22,43 @@ def _computemagdata(data, column_names, err_names): numerrcols = len(err_names) if numcols != numerrcols: # pragma: no cover raise ValueError("number of magnitude and error columns must be the same!") - + magdata = np.array(data[column_names[0]]) errdata = np.array(data[err_names[0]]) - + # Iteramos desde el segundo elemento for i in range(1, numcols): tmpmag = np.array(data[column_names[i]]) tmperr = np.array(data[err_names[i]]) - + magdata = np.vstack((magdata, tmpmag)) errdata = np.vstack((errdata, tmperr)) - + return magdata.T, errdata.T - + + class DNFInformer(CatInformer): """Descripcion of the funcion *** """ name = 'DNFInformer' config_options = CatInformer.config_options.copy() - config_options.update(bands=SHARED_PARAMS, - err_bands=SHARED_PARAMS, + config_options.update(bands=SHARED_PARAMS, + err_bands=SHARED_PARAMS, redshift_col=SHARED_PARAMS, mag_limits=SHARED_PARAMS, - nondetect_val=SHARED_PARAMS) + nondetect_val=SHARED_PARAMS, + hdf5_groupname=SHARED_PARAMS) - def __init__(self, args, 
comm=None): + def __init__(self, args, **kwargs): """ Constructor - Do CatInformer specific initialization, then check on bands """ - CatInformer.__init__(self, args, comm=comm) - + Do CatInformer specific initialization, then check on bands """ + super().__init__(args, **kwargs) + def run(self): if self.config.hdf5_groupname: training_data = self.get_data('input')[self.config.hdf5_groupname] else: - training_data = self.get_data('input') + training_data = self.get_data('input') # pragma: no cover specz = np.array(training_data[self.config['redshift_col']]) # replace nondetects @@ -64,36 +66,36 @@ def run(self): if np.isnan(self.config.nondetect_val): # pragma: no cover mask = np.isnan(training_data[col]) else: - mask = np.isclose(training_data[col], self.config.nondetect_val) - + mask = np.isclose(training_data[col], self.config.nondetect_val) + training_data[col][mask] = self.config.mag_limits[col] - training_data[err][mask] = 1.0 # could also put 0.757 for 1 sigma, but slightly inflated seems good - + training_data[err][mask] = 1.0 # could also put 0.757 for 1 sigma, but slightly inflated seems good + mag_data, mag_err = _computemagdata(training_data, - self.config.bands, - self.config.err_bands) - + self.config.bands, + self.config.err_bands) + # Training euclidean metric clf = neighbors.KNeighborsRegressor() clf.fit(mag_data, specz) - + # Training variables - Tnorm = np.linalg.norm(mag_data, axis=1) + Tnorm = np.linalg.norm(mag_data, axis=1) + + self.model = dict(train_mag=mag_data, train_err=mag_err, truez=specz, clf=clf, train_norm=Tnorm) + self.add_data('model', self.model) - self.model = dict(train_mag=mag_data, train_err=mag_err, truez=specz, clf=clf, train_norm = Tnorm - self.add_data('model', self.model) -
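+ +# For reference, the model dictionary written by DNFInformer contains: +# train_mag (Ntrain, nbands) training magnitudes +# train_err (Ntrain, nbands) training magnitude errors +# truez (Ntrain,) training spectroscopic redshifts +# clf sklearn KNeighborsRegressor fit on the magnitudes, used for neighbor preselection +# train_norm (Ntrain,) L2 norm of each training magnitude vector +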
class DNFEstimator(CatEstimator): """Photo-z estimator based on DNF (Directional Neighborhood Fitting): redshifts are estimated from a local linear fit over each galaxy's nearest neighbors in magnitude space.""" - + name = 'DNFEstimator' config_options = CatEstimator.config_options.copy() config_options.update(zmin=SHARED_PARAMS, zmax=SHARED_PARAMS, nzbins=SHARED_PARAMS, bands=SHARED_PARAMS, - err_bands=SHARED_PARAMS, + err_bands=SHARED_PARAMS, nondetect_val=SHARED_PARAMS, mag_limits=SHARED_PARAMS, redshift_col=SHARED_PARAMS, @@ -102,26 +104,38 @@ class DNFEstimator(CatEstimator): # seed=Param(int, 66, msg="random seed used in selection mode"), # ppf_value=Param(float, 0.68, msg="PPF value used in Mahalanobis distance"), selection_mode=Param(int, 1, msg="select which mode to choose the redshift estimate:" - "0: ENF, 1: ANF, 2: DNF"), # Habra que detallar mas que significa cada caso - min_n=Param(int, 25, msg="minimum number of training galaxies to use"), # ¿Necesitamos un minimo de galaxas? + "0: ENF, 1: ANF, 2: DNF"), # TODO: document in more detail what each mode means + min_n=Param(int, 25, msg="minimum number of training galaxies to use"), # do we need a minimum number of galaxies? # min_thresh=Param(float, 0.0001, msg="minimum threshold cutoff"), # Esto no tengo claro para que es - # min_dist=Param(float, 0.0001, msg="minimum Mahalanobis distance"), # Esto supongo que lo podemos eliminar + # min_dist=Param(float, 0.0001, msg="minimum Mahalanobis distance"), # this can presumably be removed bad_redshift_val=Param(float, 99., msg="redshift to assign bad redshifts"), - bad_redshift_err=Param(float, 10., msg="Gauss error width to assign to bad redshifts") # Habria que cambiar el msg ¿? + bad_redshift_err=Param(float, 10., msg="Gauss error width to assign to bad redshifts") # TODO: revise this message ) - - def __init__(self, args, comm=None): + def __init__(self, args, **kwargs): """ Constructor: - Do Estimator specific initialization + Do Estimator specific initialization """ self.truezs = None - self.model = None # Asegurar este parametro y su necesidad + self.model = None # TODO: confirm this attribute is needed self.zgrid = None - CatEstimator.__init__(self, args, comm=comm) + self.metric = "ANF" + super().__init__(args, **kwargs) usecols = self.config.bands.copy() usecols.append(self.config.redshift_col) self.usecols = usecols + # set up selection mode metric choice + if self.config.selection_mode == 0: + self.metric = "ENF" + print("using metric ENF") + elif self.config.selection_mode == 1: + self.metric = "ANF" + print("using metric ANF") + elif self.config.selection_mode == 2: + self.metric = "DNF" + print("using metric DNF") + else: + raise ValueError("invalid value for config parameter selection_mode! Valid values are 0, 1, and 2") def open_model(self, **kwargs): CatEstimator.open_model(self, **kwargs) @@ -131,10 +145,10 @@ def open_model(self, **kwargs): self.train_mag = self.model['train_mag'] self.train_err = self.model['train_err'] self.truez = self.model['truez'] self.clf = self.model['clf'] - self.Tnorm = self.model['train_norm'] - - def _process_chunk(self, start, end, data, first): # hay que dejar este nombre porque es el que ponen todos - + self.Tnorm = self.model['train_norm'] + + def _process_chunk(self, start, end, data, first): # keep this name: it is the standard method name for all estimator stages + print(f"Process {self.rank} estimating PZ PDF for rows {start:,} - {end:,}") # replace nondetects for col, err in zip(self.config.bands, self.config.err_bands): @@ -144,435 +158,407 @@ def _process_chunk(self, start, end, data, first): # hay que dejar este nombre p mask = np.isclose(data[col], self.config.nondetect_val) data[col][mask] = self.config.mag_limits[col] - data[err][mask] = 1.0 # could also put 0.757 for 1 sigma, but slightly inflated seems good - + data[err][mask] = 1.0 # could also put 0.757 for 1 sigma, but slightly inflated seems good + test_mag, test_mag_err = _computemagdata(data, - self.config.bands, - self.config.err_bands) + self.config.bands, + self.config.err_bands) num_gals = test_mag.shape[0] ncols = test_mag.shape[1] - + self.zgrid = np.linspace(self.config.zmin, self.config.zmax, self.config.nzbins) - + photoz, photozerr, photozerr_param, photozerr_fit, z1, nneighbors, de1, d1, id1, C, pdfs = \ dnf_photometric_redshift( - self.train_mag, - self.train_err, - self.truez, - self.clf, - self.Tnorm, - test_mag, - test_mag_err, - self.zgrid, - metric='ANF', - fit=True, - pdf=True, - Nneighbors=80, + self.train_mag, + self.train_err, + self.truez, + self.clf, + self.Tnorm, + test_mag, + test_mag_err, + self.zgrid, + metric=self.metric, + fit=True, + pdf=True, + Nneighbors=80, presel=500 ) ancil_dictionary = dict() qp_dnf = qp.Ensemble(qp.interp, data=dict(xvals=self.zgrid, yvals=pdfs)) - - ancil_dictionary.update(DNF_Z=photoz,photozerr=photozerr, - photozerr_param=photozerr_param,photozerr_fit=photozerr_fit,DNF_ZN=z1, - nneighbors=nneighbors,de1=de1,d1=d1,id1=id1) #, C=C, Vpdf=Vpdf + ancil_dictionary.update(DNF_Z=photoz, photozerr=photozerr, + photozerr_param=photozerr_param, photozerr_fit=photozerr_fit, DNF_ZN=z1, + nneighbors=nneighbors, de1=de1, d1=d1, id1=id1) # , C=C, Vpdf=Vpdf qp_dnf.set_ancil(ancil_dictionary) - + self._do_chunk_output(qp_dnf, start, end, first)
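+ +# Example usage of the standalone functions below (a sketch; the variable names are +# illustrative): given training magnitudes T (Ntrain, nbands) with errors Terr and +# redshifts z, a KNeighborsRegressor clf fit on (T, z), Tnorm = np.linalg.norm(T, axis=1), +# and target magnitudes V with errors Verr, +# +# photoz, photozerr, *_, pdfs = dnf_photometric_redshift( +# T, Terr, z, clf, Tnorm, V, Verr, np.linspace(0., 3., 301), +# metric='ANF', fit=True, pdf=True, Nneighbors=80, presel=500) +# +# mirrors what DNFEstimator._process_chunk does for each chunk of data. +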
""" - Compute the photometric redshifts for the validation or science sample. - - Returns - ------- - - photoz: Estimated photometric redshift. - - photozerr: Error on the photometric redshift. - - photozerr_param: Redshift error due to parameters. - - photozerr_fit:Redshift error due to fit. - - z1: Closest redshift estimate. - - nneighbors: Number of neighbors considered. - - de1: Distances Euclidea to the closest neighbor. - - d1: Distances to the closest neighbor. - - id1: Index of the closest neighbor. - - C: Additional computed parameters. - - zpdf: Matrix containing the redshifts of neighboring galaxies. - - wpdf: Matrix of weights corresponding to the neighboring redshifts. - - Vpdf: Probability Density Functions (PDFs) for the photometric redshifts of the validation set. - """ - - C=0 #coefficients by default - - # Step 1: Preselection - NEIGHBORS, Ts, Tsnorm, de1, Nvalid = preselection(V, Verr, Nneighbors, presel, T, clf, Tnorm, z) - - # Step 2: Metric computation - NEIGHBORS = metric_computation(V, NEIGHBORS, Ts, Tsnorm, metric, Nneighbors) - - # Step 3: Compute mean redshift removing outliers - photoz, photozerr, photozerr_param, photozerr_fit, z1, d1, id1, nneighbors, Vpdf, NEIGHBORS = compute_photoz_mean_routliers(NEIGHBORS, Verr, pdf, Nvalid, zgrid) - - # Step 4: Optional fitting of redshifts - if fit: - photoz, photozerr, photozerr_param, photozerr_fit, nneighbors, C = compute_photoz_fit( - NEIGHBORS, - V, - Verr, - T, - z, - fit, - photoz, - photozerr, - photozerr_param, - photozerr_fit - ) - - - # Return - return ( + """ + Compute the photometric redshifts for the validation or science sample. + + Returns + ------- + - photoz: Estimated photometric redshift. + - photozerr: Error on the photometric redshift. + - photozerr_param: Redshift error due to parameters. + - photozerr_fit:Redshift error due to fit. + - z1: Closest redshift estimate. + - nneighbors: Number of neighbors considered. + - de1: Distances Euclidea to the closest neighbor. + - d1: Distances to the closest neighbor. + - id1: Index of the closest neighbor. + - C: Additional computed parameters. + - zpdf: Matrix containing the redshifts of neighboring galaxies. + - wpdf: Matrix of weights corresponding to the neighboring redshifts. + - Vpdf: Probability Density Functions (PDFs) for the photometric redshifts of the validation set. + """ + + C = 0 # coefficients by default + + # Step 1: Preselection + NEIGHBORS, Ts, Tsnorm, de1, Nvalid = preselection(V, Verr, Nneighbors, presel, T, clf, Tnorm, z) + + # Step 2: Metric computation + NEIGHBORS = metric_computation(V, NEIGHBORS, Ts, Tsnorm, metric, Nneighbors) + + # Step 3: Compute mean redshift removing outliers + photoz, photozerr, photozerr_param, photozerr_fit, z1, d1, id1, nneighbors, Vpdf, NEIGHBORS = compute_photoz_mean_routliers(NEIGHBORS, Verr, pdf, Nvalid, zgrid) + + # Step 4: Optional fitting of redshifts + if fit: + photoz, photozerr, photozerr_param, photozerr_fit, nneighbors, C = compute_photoz_fit( + NEIGHBORS, + V, + Verr, + T, + z, + fit, photoz, photozerr, photozerr_param, - photozerr_fit, - z1, - nneighbors, - de1, - d1, - id1, - C, - Vpdf, - ) + photozerr_fit + ) -def validate_columns(V, T): - """ - Validates that the columns of T and V have the same names. - - Parameters - ---------- - T : np.ndarray - Training data. - V : np.ndarray - Validation data. - - Raises - ------ - ValueError - If the column names of T and V do not match. 
- """ - if not np.array_equal(T.dtype.names, V.dtype.names): - raise ValueError("The columns of T and V do not match. Please ensure that both T and V have the same features.") + # Return + return ( + photoz, + photozerr, + photozerr_param, + photozerr_fit, + z1, + nneighbors, + de1, + d1, + id1, + C, + Vpdf, + ) +def validate_columns(V, T): + """ + Validates that the columns of T and V have the same names. + + Parameters + ---------- + T : np.ndarray + Training data. + V : np.ndarray + Validation data. + + Raises + ------ + ValueError + If the column names of T and V do not match. + """ + if not np.array_equal(T.dtype.names, V.dtype.names): # pragma: no cover + raise ValueError("The columns of T and V do not match. Please ensure that both T and V have the same features.") + def preselection(V, Verr, Nneighbors, presel, T, clf, Tnorm, z): - """ - Perform the preselection process for photometric redshift estimation. - """ - # Ensure V is set - if V is None: - raise ValueError("Validation data 'V' is not set. Ensure it is initialized.") - - # Size training - Ntrain = T.shape[0] - Nneighbors_presel = presel if Ntrain > presel else Ntrain - - # Validation variables - validate_columns(V, T) - - - Nvalid=V.shape[0] - nneighbors = np.full(Nvalid, Nneighbors, dtype='int') - - - # Compute distances and indices for preselected neighbors - Ndistances, Nindices = clf.kneighbors(V, n_neighbors=Nneighbors_presel) - - # Handle cases where distance is zero - mask = Ndistances[:, 0] == 0 - Ndistances[mask] = np.roll(Ndistances[mask], -1, axis=1) - Nindices[mask] = np.roll(Nindices[mask], -1, axis=1) - Ndistances[mask, -1] = Ndistances[mask, 0] - Nindices[mask, -1] = Nindices[mask, 0] - - # Store distances and indices - de1 = Ndistances[:, 0] - #d1 = Ndistances[:, 0] - #Vclosest = Nindices[:, 0] - #id1 = Vclosest - - # Initialize NEIGHBORS array to store indices, distances, and redshifts - NEIGHBORS = np.zeros( - (Nvalid, Nneighbors_presel), - dtype=[('index', 'i4'), ('distance', 'f8'), ('z', 'f8')] - ) - NEIGHBORS['index'] = Nindices - NEIGHBORS['distance'] = Ndistances - NEIGHBORS['z'] = z[Nindices] + """ + Perform the preselection process for photometric redshift estimation. + """ + # Ensure V is set + if V is None: # pragma: no cover + raise ValueError("Validation data 'V' is not set. 
Ensure it is initialized.") + + # Size training + Ntrain = T.shape[0] + Nneighbors_presel = presel if Ntrain > presel else Ntrain + + # Validation variables + validate_columns(V, T) + + Nvalid = V.shape[0] + nneighbors = np.full(Nvalid, Nneighbors, dtype='int') + + # Compute distances and indices for preselected neighbors + Ndistances, Nindices = clf.kneighbors(V, n_neighbors=Nneighbors_presel) + + # Handle cases where distance is zero + mask = Ndistances[:, 0] == 0 + Ndistances[mask] = np.roll(Ndistances[mask], -1, axis=1) + Nindices[mask] = np.roll(Nindices[mask], -1, axis=1) + Ndistances[mask, -1] = Ndistances[mask, 0] + Nindices[mask, -1] = Nindices[mask, 0] + + # Store distances and indices + de1 = Ndistances[:, 0] + # d1 = Ndistances[:, 0] + # Vclosest = Nindices[:, 0] + # id1 = Vclosest + + # Initialize NEIGHBORS array to store indices, distances, and redshifts + NEIGHBORS = np.zeros( + (Nvalid, Nneighbors_presel), + dtype=[('index', 'i4'), ('distance', 'f8'), ('z', 'f8')] + ) + NEIGHBORS['index'] = Nindices + NEIGHBORS['distance'] = Ndistances + NEIGHBORS['z'] = z[Nindices] + + # Extract training preselection data + Ts = T[Nindices] + Tsnorm = Tnorm[Nindices] - # Extract training preselection data - Ts = T[Nindices] - Tsnorm = Tnorm[Nindices] + return NEIGHBORS, Ts, Tsnorm, de1, Nvalid - return NEIGHBORS, Ts, Tsnorm, de1, Nvalid def metric_computation(V, NEIGHBORS, Ts, Tsnorm, metric, Nneighbors): - """ - Compute distances based on the selected metric, sort neighbors, and store the closest neighbors. - """ - # Compute distances based on the chosen metric - if metric == 'ENF': - NEIGHBORS['distance'] = compute_euclidean_distance(V, Ts) - elif metric == 'ANF': - NEIGHBORS['distance'] = compute_angular_distance(V, Ts, Tsnorm) - elif metric == 'DNF': - NEIGHBORS['distance'] = compute_directional_distance(V, Ts, Tsnorm) - - # Sort the top Nneighbors - NEIGHBORS = np.sort(NEIGHBORS, order='distance', axis=1) - - # Select redshift of closer neighbors until n numbers of neighbors - NEIGHBORS=NEIGHBORS[:,:Nneighbors] - - return NEIGHBORS + """ + Compute distances based on the selected metric, sort neighbors, and store the closest neighbors. + """ + # Compute distances based on the chosen metric + if metric == 'ENF': + NEIGHBORS['distance'] = compute_euclidean_distance(V, Ts) + elif metric == 'ANF': + NEIGHBORS['distance'] = compute_angular_distance(V, Ts, Tsnorm) + elif metric == 'DNF': + NEIGHBORS['distance'] = compute_directional_distance(V, Ts, Tsnorm) + + # Sort the top Nneighbors + NEIGHBORS = np.sort(NEIGHBORS, order='distance', axis=1) + + # Select redshift of closer neighbors until n numbers of neighbors + NEIGHBORS = NEIGHBORS[:, :Nneighbors] + + return NEIGHBORS def compute_euclidean_distance(V, Ts): - """ - Compute distances based on Euclidean metric. - """ - - D = V[:, np.newaxis,:] - Ts - Dsquare = D[:] * D[:] - D2 = np.sum(Dsquare[:], axis=2) - d = np.sqrt(D2) - return d + """ + Compute distances based on Euclidean metric. + """ + + D = V[:, np.newaxis, :] - Ts + Dsquare = D[:] * D[:] + D2 = np.sum(Dsquare[:], axis=2) + d = np.sqrt(D2) + return d def compute_angular_distance(V, Ts, Tsnorm): - """ - Compute distances based on angular (ANF) metric. 
- """ - - Vnorm = np.linalg.norm(V, axis=1) #[:,np.newaxis]) - #Vnorm2 = np.sum(V**2, axis=1) #[:,np.newaxis]) - pescalar = np.sum(V[:, np.newaxis,:] * Ts, axis=2) # np.inner(self.V[:], Ts[:,:]) - normalization = Vnorm[:, np.newaxis] * Tsnorm - NIP = pescalar / normalization - alpha = np.sqrt(1.0 - NIP**2) - return alpha - - + """ + Compute distances based on angular (ANF) metric. + """ + + Vnorm = np.linalg.norm(V, axis=1) # [:,np.newaxis]) + # Vnorm2 = np.sum(V**2, axis=1) #[:,np.newaxis]) + pescalar = np.sum(V[:, np.newaxis, :] * Ts, axis=2) # np.inner(self.V[:], Ts[:,:]) + normalization = Vnorm[:, np.newaxis] * Tsnorm + NIP = pescalar / normalization + alpha = np.sqrt(1.0 - NIP**2) + return alpha + + def compute_directional_distance(V, Ts, Tsnorm): - """ - Compute distances based on directional (DNF) metric. - """ - - d1 = compute_euclidean_distance(V, Ts) - d2 = compute_angular_distance(V, Ts, Tsnorm) - d = d1 * d2 - return d - - + """ + Compute distances based on directional (DNF) metric. + """ + + d1 = compute_euclidean_distance(V, Ts) + d2 = compute_angular_distance(V, Ts, Tsnorm) + d = d1 * d2 + return d + + def compute_photoz_mean_routliers(NEIGHBORS, Verr, pdf, Nvalid, zgrid): - """ - Compute the mean photometric redshift removing outliers - """ - - # Extract distances and redshifts from neighbors - distances = NEIGHBORS['distance'] - zmatrix = NEIGHBORS['z'] - indices = NEIGHBORS['index'] - - # Store nearest redshift, distance and index - z1= zmatrix[:,0] - d1 = distances[:, 0] - id1 = indices[:,0] - - # --- Outlier Detection and Weighting --- - # Calculate mean distance for each sample - median_absolute_deviation = distances.mean(axis=1) - # Define the threshold for outlier detection - threshold = median_absolute_deviation #Adjust multiplier if needed (e.g., *2) - - # Create a mask for non-outlier distances - outliers_weights = distances < threshold[:, None] - - # Update the number of valid neighbors per sample - nneighbors = np.sum(outliers_weights, axis=1) - cutNneighbors = np.max(nneighbors) # Maximum number of valid neighbors - - # --- Distance Weighting --- - # Compute inverse distances for weighting - inverse_distances = 1.0 / distances - - # Apply the outlier weigh to inverse distances and distances - inverse_distances = inverse_distances * outliers_weights - distances = distances * outliers_weights - - # Normalize weights by the sum of inverse distances per sample - row_sum = inverse_distances.sum(axis=1, keepdims=True) - wmatrix = inverse_distances / row_sum - wmatrix = np.nan_to_num(wmatrix) # Handle potential NaN values from division by zero - - # --- Photometric Redshift Computation --- - # Compute the weighted mean redshift for each sample - photozmean = np.sum(zmatrix * wmatrix, axis=1) - photoz = photozmean - - # --- Redshift Error Computation --- - # Compute the standard deviation of redshifts (fit error) - zt = photozmean[:, np.newaxis] #column vector - zerrmatrix = (zmatrix - zt) ** 2 - - # Compute the error based in the fit and the parameters - Verrnorm = np.linalg.norm(Verr, axis=1) - photozerr_fit = 1.758 *np.sqrt(np.sum(zerrmatrix * wmatrix, axis=1)) - photozerr_param = np.abs(0.2 * Verrnorm * (1.0 + photoz)) - - # Combine errors to calculate the total redshift error - photozerr = np.sqrt(photozerr_param**2 + photozerr_fit**2) - - # --- PDF Computation Setup --- - # Select the top Nneighbors redshifts and weights for PDF computation - zpdf = zmatrix[:, :cutNneighbors] - wpdf = wmatrix[:, :cutNneighbors] - - # Update NEIGHBORS array to include only the top 
Nneighbors - NEIGHBORS = NEIGHBORS[:, :cutNneighbors] - - if pdf: - Vpdf = compute_pdfs(zpdf, wpdf, pdf, Nvalid, zgrid) - else: - Vpf = None + """ + Compute the mean photometric redshift removing outliers + """ - return photoz, photozerr, photozerr_param, photozerr_fit, z1, d1, id1, nneighbors, Vpdf, NEIGHBORS + # Extract distances and redshifts from neighbors + distances = NEIGHBORS['distance'] + zmatrix = NEIGHBORS['z'] + indices = NEIGHBORS['index'] + # Store nearest redshift, distance and index + z1 = zmatrix[:, 0] + d1 = distances[:, 0] + id1 = indices[:, 0] -def compute_photoz_fit(NEIGHBORS, V, Verr, T, z, fit, photoz, photozerr, photozerr_param, photozerr_fit): - """ - Compute the photometric redshift fit by iteratively removing outliers. - """ - # Initialize output parameters - Nvalid = V.shape[0] - nfilters = V.shape[1] - Ntrain = T.shape[0] - - fitIterations = 4 - badtag = 99 - - photozfit = badtag * np.zeros(Nvalid, dtype='double') - nneighbors=np.zeros(Nvalid,dtype='double') - - if fit: - C = np.zeros((Nvalid, nfilters + 1), dtype='double') - else: - C = 0 - - # Increase dimensionality of validation and training data for offsets in fit - Te = np.hstack([T, np.ones((Ntrain, 1), dtype='double')]) - Ve = np.hstack([V, np.ones((Nvalid, 1), dtype='double')]) - - # Loop over all validation samples - for i in range(0, Nvalid): - NEIGHBORSs = NEIGHBORS[i] # Get neighbors for the current sample - nneighbors[i]=len(NEIGHBORSs) - # Perform iterative fitting - for h in range(0, fitIterations): - # Build the design matrix (A) and target vector (B) for the neighbors - A = Te[NEIGHBORSs['index']] - B = z[NEIGHBORSs['index']] - - # Solve the least squares problem - X = np.linalg.lstsq(A, B) - residuals = B - np.dot(A, X[0]) # Compute residuals - - # Identify outliers using a 3-sigma threshold - abs_residuals = np.abs(residuals) - sigma3 = 3.0 * np.mean(abs_residuals) - selection = (abs_residuals < sigma3) - - # Update the number of selected neighbors - nsel = np.sum(selection) - nneighbors[i] = nsel - - # If enough neighbors remain, update NEIGHBORSs; otherwise, stop iteration - if nsel > 10: - NEIGHBORSs = NEIGHBORSs[selection] - else: - break - - # Save the solution vector - C[i] = X[0] - - # Compute the photometric redshift fit for the current sample - photozfit[i] = np.inner(X[0], Ve[i]) - nneighbors[i]= NEIGHBORSs.shape[0] - # Calculate error metrics - if X[1].size != 0: # Check if residuals from the least squares solution exist - # Compute the error based in parameters - param_error = np.abs(X[0][:-1]) * Verr[i] - photozerr_param[i] = np.sqrt(np.sum(param_error**2)) #,axis=1)) - - # Compute the error based in fit - photozerr_fit[i] = np.sqrt(X[1] / nneighbors[i]) - - # Combine errors to get total redshfit error - photozerr[i] = np.sqrt(photozerr_param[i]**2 + photozerr_fit[i]**2) - - # Assign the computed redshift fits to the output variable - photoz = photozfit - - return photoz, photozerr, photozerr_param, photozerr_fit, nneighbors, C - -def compute_pdfs(zpdf, wpdf, pdf, Nvalid, zgrid): - """ - Compute the pdfs from neighbor redshifts and weights. 
- """ - - Nvalid = zpdf.shape[0] - - # Initialize PDF-related variables if required - if pdf: - Vpdf = np.zeros((Nvalid, len(zgrid)), dtype='double') - else: - Vpdf = 0 - - bin_indices = np.digitize(zpdf, zgrid) # Indices de los bins para cada elemento de zpdf (l x m) - - # Mask matrix for each bin (l x m x len(bins)) - masks = (bin_indices[..., np.newaxis] == np.arange(len(zgrid))) - - # mask per weights and sum by columns - histograms = np.einsum('ij,ijk->ik', wpdf, masks) - Vpdf=histograms - - return Vpdf - - - -def greetings() -> str: - """A friendly greeting for a future friend. + # --- Outlier Detection and Weighting --- + # Calculate mean distance for each sample + median_absolute_deviation = distances.mean(axis=1) + # Define the threshold for outlier detection + threshold = median_absolute_deviation # Adjust multiplier if needed (e.g., *2) - Returns - ------- - str - A typical greeting from a software engineer. - """ - return "Hello from LINCC-Frameworks!" + # Create a mask for non-outlier distances + outliers_weights = distances < threshold[:, None] + # Update the number of valid neighbors per sample + nneighbors = np.sum(outliers_weights, axis=1) + cutNneighbors = np.max(nneighbors) # Maximum number of valid neighbors -def meaning() -> int: - """The meaning of life, the universe, and everything. + # --- Distance Weighting --- + # Compute inverse distances for weighting + inverse_distances = 1.0 / distances - Returns - ------- - int - The meaning of life. + # Apply the outlier weigh to inverse distances and distances + inverse_distances = inverse_distances * outliers_weights + distances = distances * outliers_weights + + # Normalize weights by the sum of inverse distances per sample + row_sum = inverse_distances.sum(axis=1, keepdims=True) + wmatrix = inverse_distances / row_sum + wmatrix = np.nan_to_num(wmatrix) # Handle potential NaN values from division by zero + + # --- Photometric Redshift Computation --- + # Compute the weighted mean redshift for each sample + photozmean = np.sum(zmatrix * wmatrix, axis=1) + photoz = photozmean + + # --- Redshift Error Computation --- + # Compute the standard deviation of redshifts (fit error) + zt = photozmean[:, np.newaxis] # column vector + zerrmatrix = (zmatrix - zt) ** 2 + + # Compute the error based in the fit and the parameters + Verrnorm = np.linalg.norm(Verr, axis=1) + photozerr_fit = 1.758 * np.sqrt(np.sum(zerrmatrix * wmatrix, axis=1)) + photozerr_param = np.abs(0.2 * Verrnorm * (1.0 + photoz)) + + # Combine errors to calculate the total redshift error + photozerr = np.sqrt(photozerr_param**2 + photozerr_fit**2) + + # --- PDF Computation Setup --- + # Select the top Nneighbors redshifts and weights for PDF computation + zpdf = zmatrix[:, :cutNneighbors] + wpdf = wmatrix[:, :cutNneighbors] + + # Update NEIGHBORS array to include only the top Nneighbors + NEIGHBORS = NEIGHBORS[:, :cutNneighbors] + + if pdf: + Vpdf = compute_pdfs(zpdf, wpdf, pdf, Nvalid, zgrid) + else: # pragma: no cover + Vpdf = None + + return photoz, photozerr, photozerr_param, photozerr_fit, z1, d1, id1, nneighbors, Vpdf, NEIGHBORS + + +def compute_photoz_fit(NEIGHBORS, V, Verr, T, z, fit, photoz, photozerr, photozerr_param, photozerr_fit): + """ + Compute the photometric redshift fit by iteratively removing outliers. 
+ """ + # Initialize output parameters + Nvalid = V.shape[0] + nfilters = V.shape[1] + Ntrain = T.shape[0] + + fitIterations = 4 + badtag = 99 + + photozfit = badtag * np.zeros(Nvalid, dtype='double') + nneighbors = np.zeros(Nvalid, dtype='double') + + if fit: + C = np.zeros((Nvalid, nfilters + 1), dtype='double') + else: # pragma: no cover + C = 0 + + # Increase dimensionality of validation and training data for offsets in fit + Te = np.hstack([T, np.ones((Ntrain, 1), dtype='double')]) + Ve = np.hstack([V, np.ones((Nvalid, 1), dtype='double')]) + + # Loop over all validation samples + for i in range(0, Nvalid): + NEIGHBORSs = NEIGHBORS[i] # Get neighbors for the current sample + nneighbors[i] = len(NEIGHBORSs) + # Perform iterative fitting + for h in range(0, fitIterations): + # Build the design matrix (A) and target vector (B) for the neighbors + A = Te[NEIGHBORSs['index']] + B = z[NEIGHBORSs['index']] + + # Solve the least squares problem + X = np.linalg.lstsq(A, B, rcond=-1) + residuals = B - np.dot(A, X[0]) # Compute residuals + + # Identify outliers using a 3-sigma threshold + abs_residuals = np.abs(residuals) + sigma3 = 3.0 * np.mean(abs_residuals) + selection = (abs_residuals < sigma3) + + # Update the number of selected neighbors + nsel = np.sum(selection) + nneighbors[i] = nsel + + # If enough neighbors remain, update NEIGHBORSs; otherwise, stop iteration + if nsel > 10: + NEIGHBORSs = NEIGHBORSs[selection] + else: # pragma: no cover + break + + # Save the solution vector + C[i] = X[0] + + # Compute the photometric redshift fit for the current sample + photozfit[i] = np.inner(X[0], Ve[i]) + nneighbors[i] = NEIGHBORSs.shape[0] + # Calculate error metrics + if X[1].size != 0: # Check if residuals from the least squares solution exist + # Compute the error based in parameters + param_error = np.abs(X[0][:-1]) * Verr[i] + photozerr_param[i] = np.sqrt(np.sum(param_error**2)) # ,axis=1)) + + # Compute the error based in fit + photozerr_fit[i] = np.sqrt(X[1] / nneighbors[i]) + + # Combine errors to get total redshfit error + photozerr[i] = np.sqrt(photozerr_param[i]**2 + photozerr_fit[i]**2) + + # Assign the computed redshift fits to the output variable + photoz = photozfit + + return photoz, photozerr, photozerr_param, photozerr_fit, nneighbors, C + + +def compute_pdfs(zpdf, wpdf, pdf, Nvalid, zgrid): + """ + Compute the pdfs from neighbor redshifts and weights. """ - return 42 - - + Nvalid = zpdf.shape[0] + + # Initialize PDF-related variables if required + if pdf: + Vpdf = np.zeros((Nvalid, len(zgrid)), dtype='double') + else: # pragma: no cover + Vpdf = 0 + + bin_indices = np.digitize(zpdf, zgrid) # Indices de los bins para cada elemento de zpdf (l x m) + + # Mask matrix for each bin (l x m x len(bins)) + masks = (bin_indices[..., np.newaxis] == np.arange(len(zgrid))) + + # mask per weights and sum by columns + histograms = np.einsum('ij,ijk->ik', wpdf, masks) + Vpdf = histograms + + return Vpdf diff --git a/tests/example_package/test_example_module.py b/tests/example_package/test_example_module.py deleted file mode 100644 index bf09628..0000000 --- a/tests/example_package/test_example_module.py +++ /dev/null @@ -1,13 +0,0 @@ -from rail.example_package import example_module - - -def test_greetings() -> None: - """Verify the output of the `greetings` function""" - output = example_module.greetings() - assert output == "Hello from LINCC-Frameworks!" 
- - -def test_meaning() -> None: - """Verify the output of the `meaning` function""" - output = example_module.meaning() - assert output == 42 diff --git a/tests/test_algos.py b/tests/test_algos.py new file mode 100644 index 0000000..72cf7d9 --- /dev/null +++ b/tests/test_algos.py @@ -0,0 +1,90 @@ +import pytest +# from rail.core.stage import RailStage +from rail.utils.testing_utils import one_algo +from rail.estimation.algos import dnf + + +def test_dnf_ANF(): + train_config_dict = {'zmin': 0.0, 'zmax': 3.0, 'nzbins': 301, + 'hdf5_groupname': 'photometry', + 'model': 'model.tmp'} + estim_config_dict = {'zmin': 0.0, 'zmax': 3.0, 'nzbins': 301, + 'hdf5_groupname': 'photometry', + 'min_n': 15, 'bad_redshift_val': 99., + 'bad_redshift_err': 10., + 'model': 'model.tmp', + 'nondetect_replace': True} + + train_algo = dnf.DNFInformer + pz_algo = dnf.DNFEstimator + results, rerun_results, _ = one_algo("DNF_ANF", train_algo, pz_algo, train_config_dict, estim_config_dict) + + +def test_dnf_ENF(): + train_config_dict = {'zmin': 0.0, 'zmax': 3.0, 'nzbins': 301, + 'hdf5_groupname': 'photometry', + 'model': 'model.tmp'} + estim_config_dict = {'zmin': 0.0, 'zmax': 3.0, 'nzbins': 301, + 'hdf5_groupname': 'photometry', + 'min_n': 15, 'bad_redshift_val': 99., + 'bad_redshift_err': 10., + 'model': 'model.tmp', + 'nondetect_replace': True, + 'selection_mode': 0} + + train_algo = dnf.DNFInformer + pz_algo = dnf.DNFEstimator + results, rerun_results, _ = one_algo("DNF_ENF", train_algo, pz_algo, train_config_dict, estim_config_dict) + + +def test_dnf_DNF(): + train_config_dict = {'zmin': 0.0, 'zmax': 3.0, 'nzbins': 301, + 'hdf5_groupname': 'photometry', + 'model': 'model.tmp'} + estim_config_dict = {'zmin': 0.0, 'zmax': 3.0, 'nzbins': 301, + 'hdf5_groupname': 'photometry', + 'min_n': 15, 'bad_redshift_val': 99., + 'bad_redshift_err': 10., + 'model': 'model.tmp', + 'nondetect_replace': True, + 'selection_mode': 2} + + train_algo = dnf.DNFInformer + pz_algo = dnf.DNFEstimator + results, rerun_results, _ = one_algo("DNF_DNF", train_algo, pz_algo, train_config_dict, estim_config_dict) + + +def test_dnf_badval(): + train_config_dict = {'zmin': 0.0, 'zmax': 3.0, 'nzbins': 301, + 'hdf5_groupname': 'photometry', + 'model': 'model.tmp'} + estim_config_dict = {'zmin': 0.0, 'zmax': 3.0, 'nzbins': 301, + 'hdf5_groupname': 'photometry', + 'min_n': 15, 'bad_redshift_val': 99., + 'bad_redshift_err': 10., + 'model': 'model.tmp', + 'nondetect_replace': True, + 'selection_mode': "BADVAL"} + + train_algo = dnf.DNFInformer + pz_algo = dnf.DNFEstimator + with pytest.raises(TypeError): + results, rerun_results, _ = one_algo("DNF_DNF", train_algo, pz_algo, train_config_dict, estim_config_dict) + + +def test_dnf_badint(): + train_config_dict = {'zmin': 0.0, 'zmax': 3.0, 'nzbins': 301, + 'hdf5_groupname': 'photometry', + 'model': 'model.tmp'} + estim_config_dict = {'zmin': 0.0, 'zmax': 3.0, 'nzbins': 301, + 'hdf5_groupname': 'photometry', + 'min_n': 15, 'bad_redshift_val': 99., + 'bad_redshift_err': 10., + 'model': 'model.tmp', + 'nondetect_replace': True, + 'selection_mode': 99} + + train_algo = dnf.DNFInformer + pz_algo = dnf.DNFEstimator + with pytest.raises(ValueError): + results, rerun_results, _ = one_algo("DNF_DNF", train_algo, pz_algo, train_config_dict, estim_config_dict)
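+ + +def test_compute_pdfs_weights_sum(): + # Additional sanity check (a sketch, not part of the original test suite): + # compute_pdfs builds each PDF as a weighted histogram of neighbor redshifts, + # so with weights normalized per object each returned PDF should sum to 1 + # over the grid. + import numpy as np + zgrid = np.linspace(0.0, 3.0, 301) + rng = np.random.default_rng(42) + zpdf = rng.uniform(0.0, 3.0, size=(5, 20)) # neighbor redshifts: 5 objects x 20 neighbors + wpdf = rng.uniform(size=(5, 20)) + wpdf /= wpdf.sum(axis=1, keepdims=True) # normalize weights per object + pdfs = dnf.compute_pdfs(zpdf, wpdf, True, 5, zgrid) + assert pdfs.shape == (5, len(zgrid)) + assert np.allclose(pdfs.sum(axis=1), 1.0)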