forked from stephenportillo/SDSS-VAE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompute_sdss_pca.py
168 lines (137 loc) · 5.81 KB
/
compute_sdss_pca.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
"""
Example of downloading and processing SDSS spectra
--------------------------------------------------
This is the code used to create the files fetched by the routine
:func:`fetch_sdss_corrected_spectra`. Be aware that this routine
downloads a large amount of data (~700MB for 4000 spectra) and takes
a long time to run (~30 minutes for 4000 spectra).
"""
# Author: Jake VanderPlas <vanderplas@astro.washington.edu>
# License: BSD
# This code is an example from astroML: see http://astroML.github.com
# Modified March 2019 by Stephen Portillo <sportill@uw.edu>
from __future__ import print_function, division
import sys
from astroML.py3k_compat import HTTPError
import numpy as np
from astroML.datasets import fetch_sdss_spectrum
from astroML.datasets.tools import query_plate_mjd_fiber, TARGET_GALAXY, TARGET_QSO_CAP, TARGET_QSO_SKIRT
from astroML.dimensionality import iterative_pca
def fetch_and_shift_spectra(n_spectra,
outfile,
primtarget=TARGET_GALAXY,
zlim=(0, 0.7),
loglam_start=3.5,
loglam_end=3.9,
Nlam=1000):
"""
This function queries CAS for matching spectra, and then downloads
them and shifts them to a common redshift binning
"""
# First query for the list of spectra to download
plate, mjd, fiber = query_plate_mjd_fiber(n_spectra, primtarget,
zlim[0], zlim[1])
# Set up arrays to hold information gathered from the spectra
spec_cln = np.zeros(n_spectra, dtype=np.int32)
lineindex_cln = np.zeros(n_spectra, dtype=np.int32)
log_NII_Ha = np.zeros(n_spectra, dtype=np.float32)
log_OIII_Hb = np.zeros(n_spectra, dtype=np.float32)
z = np.zeros(n_spectra, dtype=np.float32)
zerr = np.zeros(n_spectra, dtype=np.float32)
spectra = np.zeros((n_spectra, Nlam), dtype=np.float32)
mask = np.zeros((n_spectra, Nlam), dtype=np.bool)
specerr = np.zeros((n_spectra, Nlam), dtype=np.float32)
# also save plate, mjd, fiber to allow reference to SDSS data
plates = np.zeros(n_spectra, dtype=np.int32)
mjds = np.zeros(n_spectra, dtype=np.int32)
fibers = np.zeros(n_spectra, dtype=np.int32)
# Calculate new wavelength coefficients
new_coeff0 = loglam_start
new_coeff1 = (loglam_end - loglam_start) / Nlam
# Now download all the needed spectra, and resample to a common
# wavelength bin.
n_spectra = len(plate)
num_skipped = 0
# changed counter and loop so that skipped spectra do not create gaps in arrays
j = 0
for i in range(n_spectra):
sys.stdout.write(' %i / %i spectra\r' % (i + 1, n_spectra))
sys.stdout.flush()
try:
spec = fetch_sdss_spectrum(plate[i], mjd[i], fiber[i], data_home='/epyc/users/sportill/specAE/cache')
except HTTPError:
num_skipped += 1
print("%i, %i, %i not found" % (plate[i], mjd[i], fiber[i]))
continue
spec_rebin = spec.restframe().rebin(new_coeff0, new_coeff1, Nlam)
if spec.z < zlim[0] or spec.z > zlim[1]:
num_skipped += 1
print("%i, %i, %i outside redshift range" % (plate[i], mjd[i], fiber[i]))
continue
if np.all(spec_rebin.spectrum == 0):
num_skipped += 1
print("%i, %i, %i is all zero" % (plate[i], mjd[i], fiber[i]))
continue
#if spec.spec_cln < 2 or spec.spec_cln > 3:
# num_skipped += 1
# print("%i, %i, %i is not a galaxy spectrum" % (plate[i], mjd[i], fiber[i]))
# continue
spec_cln[j] = spec.spec_cln
lineindex_cln[j], (log_NII_Ha[j], log_OIII_Hb[j])\
= spec.lineratio_index()
z[j] = spec.z
zerr[j] = spec.zerr
spectra[j] = spec_rebin.spectrum
mask[j] = spec_rebin.compute_mask(0.5, 5)
assert((mask[j] == 0).any())
specerr[j] = spec_rebin.error
plates[j] = plate[i]
mjds[j] = mjd[i]
fibers[j] = fiber[i]
j += 1
sys.stdout.write('\n')
N = j
print(" %i spectra skipped" % num_skipped)
print(" %i spectra processed" % N)
print("saving to %s" % outfile)
np.savez(outfile,
spectra=spectra[:N],
mask=mask[:N],
spec_err=specerr[:N],
coeff0=new_coeff0,
coeff1=new_coeff1,
spec_cln=spec_cln[:N],
lineindex_cln=lineindex_cln[:N],
log_NII_Ha=log_NII_Ha[:N],
log_OIII_Hb=log_OIII_Hb[:N],
z=z[:N],
zerr=zerr[:N],
plate=plates[:N],
mjd=mjds[:N],
fiber=fibers[:N])
def spec_iterative_pca(outfile, n_ev=10, n_iter=20, norm='L2'):
"""
This function takes the file outputted above, performs an iterative
PCA to fill in the gaps, and appends the results to the same file.
"""
data_in = np.load(outfile)
spectra = data_in['spectra']
mask = data_in['mask']
res = iterative_pca(spectra, mask,
n_ev=n_ev, n_iter=n_iter, norm=norm,
full_output=True)
input_dict = dict([(key, data_in[key]) for key in data_in.files])
# don't save the reconstructed spectrum: this can easily
# be recomputed from the other parameters.
input_dict['mu'] = res[1]
input_dict['evecs'] = res[2]
input_dict['evals'] = res[3]
input_dict['norms'] = res[4]
input_dict['coeffs'] = res[5]
np.savez(outfile, **input_dict)
if __name__ == '__main__':
# download from main galaxy sample and low redshift quasars
# limit redshift to 0.36
fetch_and_shift_spectra(64000, 'spec64k.npz', primtarget=TARGET_GALAXY+TARGET_QSO_CAP+TARGET_QSO_SKIRT, zlim=(0, 0.36),
loglam_start=3.53, loglam_end=3.92)
spec_iterative_pca('spec64k.npz')