Example of downloading and processing SDSS spectraΒΆ
This is the code used to create the files fetched by the routine
fetch_sdss_corrected_spectra()
. Be aware that this routine
downloads a large amount of data (~700MB for 4000 spectra) and takes
a long time to run (~30 minutes for 4000 spectra).
# Author: Jake VanderPlas <vanderplas@astro.washington.edu>
# License: BSD
# The figure is an example from astroML: see http://astroML.github.com
import sys
from urllib.error import HTTPError
import numpy as np
from astroML.datasets import fetch_sdss_spectrum
from astroML.datasets.tools import query_plate_mjd_fiber, TARGET_GALAXY
from astroML.dimensionality import iterative_pca
def fetch_and_shift_spectra(n_spectra,
outfile,
primtarget=TARGET_GALAXY,
zlim=(0, 0.7),
loglam_start=3.5,
loglam_end=3.9,
Nlam=1000):
"""
This function queries CAS for matching spectra, and then downloads
them and shifts them to a common redshift binning
"""
# First query for the list of spectra to download
plate, mjd, fiber = query_plate_mjd_fiber(n_spectra, primtarget,
zlim[0], zlim[1])
# Set up arrays to hold information gathered from the spectra
spec_cln = np.zeros(n_spectra, dtype=np.int32)
lineindex_cln = np.zeros(n_spectra, dtype=np.int32)
log_NII_Ha = np.zeros(n_spectra, dtype=np.float32)
log_OIII_Hb = np.zeros(n_spectra, dtype=np.float32)
z = np.zeros(n_spectra, dtype=np.float32)
zerr = np.zeros(n_spectra, dtype=np.float32)
spectra = np.zeros((n_spectra, Nlam), dtype=np.float32)
mask = np.zeros((n_spectra, Nlam), dtype=np.bool)
# Calculate new wavelength coefficients
new_coeff0 = loglam_start
new_coeff1 = (loglam_end - loglam_start) / Nlam
# Now download all the needed spectra, and resample to a common
# wavelength bin.
n_spectra = len(plate)
num_skipped = 0
i = 0
while i < n_spectra:
sys.stdout.write(' %i / %i spectra\r' % (i + 1, n_spectra))
sys.stdout.flush()
try:
spec = fetch_sdss_spectrum(plate[i], mjd[i], fiber[i])
except HTTPError:
num_skipped += 1
print("%i, %i, %i not found" % (plate[i], mjd[i], fiber[i]))
i += 1
continue
spec_rebin = spec.restframe().rebin(new_coeff0, new_coeff1, Nlam)
if np.all(spec_rebin.spectrum == 0):
num_skipped += 1
print("%i, %i, %i is all zero" % (plate[i], mjd[i], fiber[i]))
i += 1
continue
spec_cln[i] = spec.spec_cln
lineindex_cln[i], (log_NII_Ha[i], log_OIII_Hb[i])\
= spec.lineratio_index()
z[i] = spec.z
zerr[i] = spec.zerr
spectra[i] = spec_rebin.spectrum
mask[i] = spec_rebin.compute_mask(0.5, 5)
i += 1
sys.stdout.write('\n')
N = i
print(" %i spectra skipped" % num_skipped)
print(" %i spectra processed" % N)
print("saving to %s" % outfile)
np.savez(outfile,
spectra=spectra[:N],
mask=mask[:N],
coeff0=new_coeff0,
coeff1=new_coeff1,
spec_cln=spec_cln[:N],
lineindex_cln=lineindex_cln[:N],
log_NII_Ha=log_NII_Ha[:N],
log_OIII_Hb=log_OIII_Hb[:N],
z=z[:N],
zerr=zerr[:N])
def spec_iterative_pca(outfile, n_ev=10, n_iter=20, norm='L2'):
"""
This function takes the file outputted above, performs an iterative
PCA to fill in the gaps, and appends the results to the same file.
"""
data_in = np.load(outfile)
spectra = data_in['spectra']
mask = data_in['mask']
res = iterative_pca(spectra, mask,
n_ev=n_ev, n_iter=n_iter, norm=norm,
full_output=True)
input_dict = {key: data_in[key] for key in data_in.files}
# don't save the reconstructed spectrum: this can easily
# be recomputed from the other parameters.
input_dict['mu'] = res[1]
input_dict['evecs'] = res[2]
input_dict['evals'] = res[3]
input_dict['norms'] = res[4]
input_dict['coeffs'] = res[5]
np.savez(outfile, **input_dict)
if __name__ == '__main__':
fetch_and_shift_spectra(4000, 'spec4000.npz')
spec_iterative_pca('spec4000.npz')