Hi all,

I am struggling with the speed of my API request.

I want to download daily averages from the “ERA5 pressure levels” dataset for temperature and relative humidity for each pressure level and four different time zones (US time zones) covering the period 2006-2020.

It seems that I can only pass one variable, UTC shift and pressure level per request.

However, one request takes more than 2 minutes.

Thus, it will take too much time to retrieve all the data I need.

Is there any way to speed up the process, e.g. by requesting multiple variables, UTC shifts, and pressure levels at once?

I wrote the following code (based on "Retrieve daily ERA5/ERA5-Land data using the CDS API"):


# Packages
import cdsapi
import requests
import urllib3
# Silence urllib3 warnings for the whole run.
# NOTE(review): presumably meant to hide insecure-request warnings - confirm.
urllib3.disable_warnings()

# Output directory. File names are appended to this string with "+", so it
# must end with a path separator. The leading "..." is a placeholder to be
# replaced with a real directory.
PATH = ".../ERA5_pressure_levels/"
 
# Requires:
# 1) the CDS API to be installed and working on your system
# 2) You have agreed to the ERA5 Licence (via the CDS web page)
# 3) Selection of required variable, daily statistic, etc

# CDS API client shared by every request below; created with timeout=600
# (presumably seconds - confirm against the cdsapi documentation).
c = cdsapi.Client(timeout=600)

# Time-zone labels UTC-05 .. UTC-08, one per requested offset.
UTC = ["UTC-" + str(hours).zfill(2) for hours in range(5, 9)]

# Variables to retrieve.
VAR = ["temperature", "relative_humidity"]

# Pressure levels, passed to the API as strings.
PS = [
    str(level)
    for level in (
        1, 2, 3, 5, 7, 10, 20, 30, 50, 70,
        100, 125, 150, 175, 200, 225, 250,
        300, 350, 400, 450, 500, 550, 600, 650, 700, 750,
        775, 800, 825, 850, 875, 900, 925, 950, 975, 1000,
    )
]

# Years 2006-2020 inclusive, as strings.
YEARS = [str(year) for year in range(2006, 2021)]

# Months as zero-padded two-digit strings, "01" .. "12".
MONTHS = [str(month).zfill(2) for month in range(1, 13)]

# Latitude/longitude window requested for each time-zone label. The request
# is otherwise identical for every time zone, so only this table varies.
AREA_BY_UTC = {
    "UTC-05": {"lat": [23, 50], "lon": [-91, -65]},
    "UTC-06": {"lat": [24, 51], "lon": [-106, -83]},
    "UTC-07": {"lat": [29, 51], "lon": [-120, -99]},
    "UTC-08": {"lat": [31, 51], "lon": [-126, -113]},
}


def _request_daily_mean(yr, mn, var, ps, utc):
    """Run the daily-statistics workflow for one parameter combination.

    Args:
        yr: Year as a string, e.g. '2006'.
        mn: Month as a zero-padded string, e.g. '01'.
        var: Variable name, e.g. 'temperature'.
        ps: Pressure level as a string.
        utc: Time-zone label; must be a key of AREA_BY_UTC.

    Returns:
        Whatever ``c.service`` returns; the caller reads
        ``result[0]['location']`` from it.

    Raises:
        KeyError: If ``utc`` has no entry in AREA_BY_UTC. This replaces the
            silent reuse of a previous result for unknown labels.
    """
    return c.service(
        "tool.toolbox.orchestrator.workflow",
        params={
            "realm": "user-apps",
            "project": "app-c3s-daily-era5-statistics",
            "version": "master",
            "kwargs": {
                "dataset": "reanalysis-era5-pressure-levels",
                "product_type": "reanalysis",
                "variable": var,
                "pressure_level": ps,
                "statistic": "daily_mean",
                "year": yr,
                "month": mn,
                "time_zone": utc + ":0",
                "frequency": "1-hourly",
                "grid": "0.25/0.25",
                "area": AREA_BY_UTC[utc],
            },
            "workflow_name": "application",
        },
    )


def _download(url, target):
    """Stream ``url`` to the file ``target`` in 1 MiB chunks.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never written out as a data file.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True) as res:
        res.raise_for_status()
        with open(target, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024 * 1024):
                fh.write(chunk)


# Loop over all parameters: one request per year/month/variable/level/zone.
for yr in YEARS:
    for mn in MONTHS:
        for var in VAR:
            for ps in PS:
                for utc in UTC:

                    print('Running: '+yr+mn+var+ps+utc)

                    result = _request_daily_mean(yr, mn, var, ps, utc)

                    # Output file name encodes time zone, level, variable,
                    # year and month.
                    file_name = "download_"+utc+ "_"+ ps +"_" + var +"_"+ yr + mn+ ".nc"

                    # Append to the index file and close it straight away, so
                    # the entry is on disk even if the run is interrupted and
                    # no handle is left open between iterations.
                    with open(PATH+'Filenames.txt', "a") as w:
                        w.write(file_name + '\n')

                    print("Writing data to " + file_name)
                    _download(result[0]['location'], PATH+file_name)


2 Comments

  1. Hi Felix,

    At the moment, I think requests are limited to one variable/level/month at a time in order to manage the load on the CDS.

    Thanks,

    Kevin 

    1. In this case it seems faster to download ERA5 hourly data on pressure levels for the entire US and calculate daily averages for the respective time zones myself. I will share my code for others facing a similar issue:

      # Packages
      import sys
      import xarray as xr
      import numpy as np
      import geopandas as gpd
      import pandas as pd
      import glob
      import dask
      from datetime import timedelta
      
      # ERA5 hourly pressure-level data for the entire US. Load at least two
      # consecutive years: after the time shift, the last local days of a
      # year need hours from 1 January of the following year.
      # The string below is a placeholder for the real file path or glob.
      d = xr.open_mfdataset("ERA5 pressure level for the entire US")
      
      # Split the dataset into one longitude/latitude window per time zone.
      # Latitude bounds are given from high to low.
      # NOTE(review): presumably because the latitude coordinate is stored
      # north-to-south - confirm against the input files.
      d_UTC5 = d.sel(longitude=slice(-91, -65), latitude=slice(50, 23), drop=True)
      d_UTC6 = d.sel(longitude=slice(-106, -83), latitude=slice(51, 24), drop=True)
      d_UTC7 = d.sel(longitude=slice(-120, -99), latitude=slice(51, 29), drop=True)
      d_UTC8 = d.sel(longitude=slice(-126, -113), latitude=slice(51, 31), drop=True)
      
      # Shift the time coordinate of each window to UTC-5, UTC-6, UTC-7 and
      # UTC-8, so that the daily bins below follow local days.
      d_UTC5['time'] = d_UTC5.time.get_index('time') + timedelta(hours=-5)
      d_UTC6['time'] = d_UTC6.time.get_index('time') + timedelta(hours=-6)
      d_UTC7['time'] = d_UTC7.time.get_index('time') + timedelta(hours=-7)
      d_UTC8['time'] = d_UTC8.time.get_index('time') + timedelta(hours=-8)
      
      # Calculate daily averages for each time-zone window and store them in
      # separate netCDF files.
      d_UTC5 = d_UTC5.resample(time='1D').mean('time')
      d_UTC5.to_netcdf('UTC-5_'+'.nc')
      d_UTC6 = d_UTC6.resample(time='1D').mean('time')
      d_UTC6.to_netcdf('UTC-6_'+'.nc')
      d_UTC7 = d_UTC7.resample(time='1D').mean('time')
      d_UTC7.to_netcdf('UTC-7_'+'.nc')
      d_UTC8 = d_UTC8.resample(time='1D').mean('time')
      d_UTC8.to_netcdf('UTC-8_'+'.nc')