""" Example of downloading and processing SDSS spectra -------------------------------------------------- This is the code used to create the files fetched by the routine :func:`fetch_sdss_corrected_spectra`. Be aware that this routine downloads a large amount of data (~700MB for 4000 spectra) and takes a long time to run (~30 minutes for 4000 spectra). """ # Author: Jake VanderPlas # License: BSD # The figure is an example from astroML: see http://astroML.github.com import sys from urllib.error import HTTPError import numpy as np from astroML.datasets import fetch_sdss_spectrum from astroML.datasets.tools import query_plate_mjd_fiber, TARGET_GALAXY from astroML.dimensionality import iterative_pca def fetch_and_shift_spectra(n_spectra, outfile, primtarget=TARGET_GALAXY, zlim=(0, 0.7), loglam_start=3.5, loglam_end=3.9, Nlam=1000): """ This function queries CAS for matching spectra, and then downloads them and shifts them to a common redshift binning """ # First query for the list of spectra to download plate, mjd, fiber = query_plate_mjd_fiber(n_spectra, primtarget, zlim[0], zlim[1]) # Set up arrays to hold information gathered from the spectra spec_cln = np.zeros(n_spectra, dtype=np.int32) lineindex_cln = np.zeros(n_spectra, dtype=np.int32) log_NII_Ha = np.zeros(n_spectra, dtype=np.float32) log_OIII_Hb = np.zeros(n_spectra, dtype=np.float32) z = np.zeros(n_spectra, dtype=np.float32) zerr = np.zeros(n_spectra, dtype=np.float32) spectra = np.zeros((n_spectra, Nlam), dtype=np.float32) mask = np.zeros((n_spectra, Nlam), dtype=np.bool) # Calculate new wavelength coefficients new_coeff0 = loglam_start new_coeff1 = (loglam_end - loglam_start) / Nlam # Now download all the needed spectra, and resample to a common # wavelength bin. n_spectra = len(plate) num_skipped = 0 i = 0 while i < n_spectra: sys.stdout.write(' %i / %i spectra\r' % (i + 1, n_spectra)) sys.stdout.flush() try: spec = fetch_sdss_spectrum(plate[i], mjd[i], fiber[i]) except HTTPError: num_skipped += 1 print("%i, %i, %i not found" % (plate[i], mjd[i], fiber[i])) i += 1 continue spec_rebin = spec.restframe().rebin(new_coeff0, new_coeff1, Nlam) if np.all(spec_rebin.spectrum == 0): num_skipped += 1 print("%i, %i, %i is all zero" % (plate[i], mjd[i], fiber[i])) i += 1 continue spec_cln[i] = spec.spec_cln lineindex_cln[i], (log_NII_Ha[i], log_OIII_Hb[i])\ = spec.lineratio_index() z[i] = spec.z zerr[i] = spec.zerr spectra[i] = spec_rebin.spectrum mask[i] = spec_rebin.compute_mask(0.5, 5) i += 1 sys.stdout.write('\n') N = i print(" %i spectra skipped" % num_skipped) print(" %i spectra processed" % N) print("saving to %s" % outfile) np.savez(outfile, spectra=spectra[:N], mask=mask[:N], coeff0=new_coeff0, coeff1=new_coeff1, spec_cln=spec_cln[:N], lineindex_cln=lineindex_cln[:N], log_NII_Ha=log_NII_Ha[:N], log_OIII_Hb=log_OIII_Hb[:N], z=z[:N], zerr=zerr[:N]) def spec_iterative_pca(outfile, n_ev=10, n_iter=20, norm='L2'): """ This function takes the file outputted above, performs an iterative PCA to fill in the gaps, and appends the results to the same file. """ data_in = np.load(outfile) spectra = data_in['spectra'] mask = data_in['mask'] res = iterative_pca(spectra, mask, n_ev=n_ev, n_iter=n_iter, norm=norm, full_output=True) input_dict = {key: data_in[key] for key in data_in.files} # don't save the reconstructed spectrum: this can easily # be recomputed from the other parameters. input_dict['mu'] = res[1] input_dict['evecs'] = res[2] input_dict['evals'] = res[3] input_dict['norms'] = res[4] input_dict['coeffs'] = res[5] np.savez(outfile, **input_dict) if __name__ == '__main__': fetch_and_shift_spectra(4000, 'spec4000.npz') spec_iterative_pca('spec4000.npz')