Programmatically download OpenAQ data

Programmatically download OpenAQ data
Author

Zeel B Patel

Published

September 21, 2020

# uncomment to install these libraries
# !pip install boto3 botocore

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import boto3
import botocore
import os
from IPython.display import clear_output

Setup

# Source location on S3: bucket name and the key prefix under which the
# per-day objects are listed.
bucket_name = 'openaq-fetches'
prefix = 'realtime-gzipped/'

# Local root directory; S3 keys are appended to it verbatim, so it must end with '/'.
path = '/content/drive/MyDrive/IJCAI-21/data/OpenAQ-Delhi/'

# Date range to fetch; both ends are inclusive.
start_date = '2020/01/01'
end_date = '2020/12/31'

# S3 client configured with an UNSIGNED signature version, i.e. requests are
# sent without credentials.
s3 = boto3.client(
  's3',
  config=botocore.config.Config(signature_version=botocore.UNSIGNED),
)

Download

# Download every object whose key starts with `prefix + YYYY-MM-DD`, for each
# day in [start_date, end_date], mirroring the S3 key layout under `path`.
#
# A paginator is used because a single list_objects call returns at most
# 1000 keys; without it, days with more objects would be silently truncated.
paginator = s3.get_paginator('list_objects')

for day in pd.date_range(start=start_date, end=end_date):
  clear_output(wait=True)
  date = day.strftime('%Y-%m-%d') # keys are prefixed with YYYY-MM-DD
  print('Downloading:', date)

  for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix+date):
    # 'Contents' is absent from the response when no key matches the prefix,
    # so a day without data is skipped instead of raising KeyError.
    for file_obj in page.get('Contents', []):
      f_name = file_obj['Key']
      local_file = path+f_name

      # Create the parent directory tree; exist_ok avoids the check-then-create race.
      os.makedirs(os.path.dirname(local_file), exist_ok=True)

      s3.download_file(bucket_name, f_name, local_file)
Downloading: 2020-05-04

Validate

# Verify that every object listed on S3 for each day in [start_date, end_date]
# exists locally under `path`. Raises AssertionError carrying the first
# missing key; prints 'Validated' when nothing is missing.
#
# A paginator is used because a single list_objects call returns at most
# 1000 keys; without it, keys beyond the first page would never be checked.
paginator = s3.get_paginator('list_objects')

for day in pd.date_range(start=start_date, end=end_date):
  date = day.strftime('%Y-%m-%d') # keys are prefixed with YYYY-MM-DD

  for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix+date):
    # 'Contents' is absent from the response when no key matches the prefix.
    for file_obj in page.get('Contents', []):
      # Explicit raise instead of `assert` so the check is not stripped under `python -O`.
      if not os.path.exists(path+file_obj['Key']):
        raise AssertionError(file_obj['Key'])


print('Validated')