Download CPCB live data

Download CPCB data with Selenium
Data
Author

Zeel B Patel

Published

December 10, 2024

import os
import re
from glob import glob
import pandas as pd
from tqdm.notebook import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from time import sleep

# Landing page of the CPCB CAAQM dashboard; the download browser opens it first
# so that the captcha can be solved by hand.
HOME_URL = "https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing"
# Data-repository page; its dropdowns are scraped in the dry run to build the
# state / city / station / site_id metadata table.
DOWNLOAD_OLD_DATA_URL = "https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing/caaqm-data-repository"
# Data page of the dashboard (not referenced by the visible part of this notebook).
DOWNLOAD_PAGE_URL = "https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing/data"
def click_it(driver, element):
    """Click ``element`` by running a JavaScript ``click()`` through ``driver``.

    The click is issued with ``driver.execute_script`` instead of the
    element's own ``click()`` method. Nothing is returned.
    """
    js_click = "arguments[0].click();"
    driver.execute_script(js_click, element)
    
def _xpath_literal(text):
    """Return ``text`` written as an XPath 1.0 string literal."""
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    # XPath 1.0 has no escape character, so a string holding both quote kinds
    # has to be assembled with concat() from single-quoted pieces.
    pieces = ", \"'\", ".join(f"'{piece}'" for piece in text.split("'"))
    return f"concat({pieces})"


def find_it(element, option):
    """Return the ``<li>`` entry for ``option``.

    An exact (whitespace-normalised) match among the ``<li>`` descendants of
    ``element`` is preferred, so that e.g. "Noida" is not resolved to
    "Greater Noida" and an entry of another dropdown is not picked up.
    If no such entry exists, the lookup falls back to the first ``<li>`` in
    the whole document whose text contains ``option``.

    Args:
        element: WebElement of the dropdown that is searched.
        option: Visible text of the wanted entry; may contain quote characters.

    Returns:
        The matching ``<li>`` WebElement.

    Raises:
        Whatever ``element.find_element`` raises when the fallback lookup
        finds nothing either.
    """
    literal = _xpath_literal(option)
    exact_matches = element.find_elements(
        By.XPATH, f".//li[normalize-space(.) = {literal}]"
    )
    if exact_matches:
        return exact_matches[0]
    # Substring search over the whole document, used when no exact entry exists.
    return element.find_element(By.XPATH, f"//li[contains(text(), {literal})]")

def select_dropdown_option(driver, element, option):
    """Choose ``option`` in the dropdown ``element``.

    The dropdown is clicked first, then the entry located by ``find_it`` is
    clicked through ``click_it``.
    """
    element.click()  # presumably expands the dropdown so its entries can be found
    click_it(driver, find_it(element, option))

Dry run to get metadata

# Dry run: drive a headless Chrome session to the data-repository page and
# collect its dropdown widgets so the station metadata can be read from them.

# headless chrome
options = Options()
options.add_argument("--headless")

# open the browser
driver = webdriver.Chrome(options=options)

# open the website
driver.get(DOWNLOAD_OLD_DATA_URL)

# wait (up to 10 seconds) for the page to load and the dropdowns to appear
dropdowns = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".select-box")))
# notebook display: how many ".select-box" elements were found
len(dropdowns)
5
# Name the five ".select-box" widgets; the unpacking raises ValueError if the
# page yields a different number of them. The order is assumed to be
# data type, frequency, state, city, station -- confirm if the page changes.
drop_data_type, drop_frequency, drop_states, drop_cities, drop_stations = dropdowns
# Select data type
select_dropdown_option(driver, drop_data_type, "Raw data")

# Select frequency
select_dropdown_option(driver, drop_frequency, "1 day")

# Get the states
drop_states.click() # Open the dropdown
# The text of the open dropdown is read line by line; the "▲" marker line is
# removed so that only the option names remain.
states = drop_states.text.replace("▲\n", "").split("\n")
print("Number of states:", len(states))
drop_states.click() # Close the dropdown
Number of states: 31
def _dropdown_options(dropdown):
    """Open ``dropdown``, read its option names from its text, and close it again."""
    dropdown.click()  # Open the dropdown
    names = dropdown.text.replace("▲\n", "").split("\n")
    dropdown.click()  # Close the dropdown
    return names


# Corner cases: for these stations the site id is hard-coded and the station
# is not selected on the page.
_KNOWN_SITE_IDS = {
    "Municipal Corporation Office, Dharuhera - HSPCB": "site_5044",
    "Civil Lines, Ajmer - RSPCB": "site_1392",
}

metadata_df = pd.DataFrame(columns=["State", "City", "Station", "site_id"])

# This loop took less than a minute to run
progress_bar = tqdm(total=600) # as of 2024, 560 stations. update this number if it changes
for state in states:
    select_dropdown_option(driver, drop_states, state)

    for city in _dropdown_options(drop_cities):
        select_dropdown_option(driver, drop_cities, city)

        for station in _dropdown_options(drop_stations):
            site_id = _KNOWN_SITE_IDS.get(station)
            if site_id is None:
                try:
                    select_dropdown_option(driver, drop_stations, station)
                except Exception as err:
                    # Only ordinary errors are skipped; Ctrl-C / SystemExit
                    # still stop the loop instead of being swallowed.
                    print("Unable to select station")
                    print(station)
                    print(drop_stations.text)
                    print("Reason:", repr(err))
                    continue
                # After the selection the dropdown exposes the chosen site id
                # in its "ng-reflect-model" attribute.
                site_id = drop_stations.get_attribute("ng-reflect-model")
            metadata_df.loc[len(metadata_df)] = [state, city, station, site_id]
            progress_bar.update(1)
progress_bar.close()
len(metadata_df)
560
# notebook display: first rows of the scraped metadata
metadata_df.head()
State City Station site_id
0 Andhra Pradesh Amaravati Secretariat, Amaravati - APPCB site_1406
1 Andhra Pradesh Anantapur Gulzarpet, Anantapur - APPCB site_5632
2 Andhra Pradesh Chittoor Gangineni Cheruvu, Chittoor - APPCB site_5665
3 Andhra Pradesh Kadapa Yerramukkapalli, Kadapa - APPCB site_5693
4 Andhra Pradesh Rajamahendravaram Anand Kala Kshetram, Rajamahendravaram - APPCB site_1399
# notebook display: last rows of the scraped metadata
metadata_df.tail()
State City Station site_id
555 West Bengal Kolkata Rabindra Bharati University, Kolkata - WBPCB site_296
556 West Bengal Kolkata Fort William, Kolkata - WBPCB site_5110
557 West Bengal Kolkata Victoria, Kolkata - WBPCB site_309
558 West Bengal Kolkata Bidhannagar, Kolkata - WBPCB site_5129
559 West Bengal Siliguri Ward-32 Bapupara, Siliguri - WBPCB site_1419
# Print every group of rows that share one site_id.
site_counts = metadata_df.site_id.value_counts()
for site_id, occurrences in site_counts.items():
    if occurrences > 1:
        print(metadata_df[metadata_df.site_id == site_id])
           State        City                               Station    site_id
25         Bihar  Aurangabad  MIDC Chilkalthana, Aurangabad - MPCB  site_5788
254  Maharashtra  Aurangabad  MIDC Chilkalthana, Aurangabad - MPCB  site_5788
           State        City                              Station   site_id
26         Bihar  Aurangabad  More Chowk Waluj, Aurangabad - MPCB  site_198
255  Maharashtra  Aurangabad  More Chowk Waluj, Aurangabad - MPCB  site_198
             State           City                                    Station  \
499  Uttar Pradesh  Greater Noida  Knowledge Park - V, Greater Noida - UPPCB   
526  Uttar Pradesh          Noida  Knowledge Park - V, Greater Noida - UPPCB   

       site_id  
499  site_5121  
526  site_5121  
             State           City  \
498  Uttar Pradesh  Greater Noida   
525  Uttar Pradesh          Noida   

                                         Station    site_id  
498  Knowledge Park - III, Greater Noida - UPPCB  site_1541  
525  Knowledge Park - III, Greater Noida - UPPCB  site_1541  
           State        City                              Station    site_id
28         Bihar  Aurangabad  Rachnakar Colony, Aurangabad - MPCB  site_5789
257  Maharashtra  Aurangabad  Rachnakar Colony, Aurangabad - MPCB  site_5789
           State        City                           Station    site_id
27         Bihar  Aurangabad  Gurdeo Nagar, Aurangabad - BSPCB  site_5544
256  Maharashtra  Aurangabad  Gurdeo Nagar, Aurangabad - BSPCB  site_5544
# clean up
# Each duplicated site_id printed above is listed under two State/City labels.
# For every pair, one row is removed; it is identified by the (column, value)
# label it carries together with the station name.
rows_to_drop = [
    ("State", "Bihar", "MIDC Chilkalthana, Aurangabad - MPCB"),
    ("City", "Noida", "Knowledge Park - III, Greater Noida - UPPCB"),
    ("State", "Bihar", "More Chowk Waluj, Aurangabad - MPCB"),
    ("State", "Maharashtra", "Gurdeo Nagar, Aurangabad - BSPCB"),
    ("State", "Bihar", "Rachnakar Colony, Aurangabad - MPCB"),
    ("City", "Noida", "Knowledge Park - V, Greater Noida - UPPCB"),
]
# .index.item() raises ValueError unless exactly one row matches, so a change
# in the scraped data is noticed instead of silently dropping the wrong rows.
drop_items = [
    metadata_df[(metadata_df[column] == value) & (metadata_df.Station == station)].index.item()
    for column, value, station in rows_to_drop
]

metadata_df.drop(drop_items, inplace=True)
len(metadata_df)
554
# Sanity check: after the clean-up every site_id must occur exactly once.
assert set(metadata_df.site_id.value_counts()) == {1}, "duplicate or missing site_id in metadata_df"
metadata_df.to_csv("metadata.csv", index=False)

# The dry run is finished; close the headless browser so its Chrome process is
# not left running when `driver` is rebound for the downloads.
driver.quit()

Downloading data

# URL defaults to PM2.5 and PM10; pass `parameters` / dates to adapt it to your needs
def get_url(state, city, site_id, from_date="01-01-2024 T00:00:00Z",
            to_date="11-12-2024 T16:45:59Z",
            parameters=(("PM2.5", "parameter_193"), ("PM10", "parameter_215"))):
    """Build the data-report URL for one station.

    The report request is a JSON object that is serialised, wrapped once more
    as a JSON string and percent-encoded twice before being appended to the
    report page address. With the default arguments the result is identical
    to the previously hard-coded URL.

    Args:
        state: State name as listed in the metadata.
        city: City name as listed in the metadata.
        site_id: Station identifier such as "site_1406".
        from_date: Start of the requested period, in the format of the
            default value (presumably DD-MM-YYYY).
        to_date: End of the requested period, same format as ``from_date``.
        parameters: Sequence of ``(display name, parameter id)`` pairs to
            request.

    Returns:
        The complete URL as a string.
    """
    import json
    from urllib.parse import quote

    parameters = list(parameters)
    report = {
        "parameter_list": [
            {"id": position, "itemName": name, "itemValue": parameter_id}
            for position, (name, parameter_id) in enumerate(parameters)
        ],
        "criteria": "24 Hours",
        "reportFormat": "Tabular",
        "fromDate": from_date,
        "toDate": to_date,
        "state": state,
        "city": city,
        "station": site_id,
        "parameter": [parameter_id for _, parameter_id in parameters],
        "parameterNames": [name for name, _ in parameters],
    }
    # Compact JSON, then quoted again as a JSON string literal.
    payload = json.dumps(json.dumps(report, separators=(",", ":")))
    # Two rounds of percent-encoding; safe="" also escapes "/" and any other
    # reserved character that may occur in a state or city name.
    encoded = quote(quote(payload, safe=""), safe="")
    return (
        "https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/"
        f"caaqm-view-data-report/{encoded}"
    )
# Visible (non-headless) Chrome session whose downloads go to a fixed
# directory; it starts on the dashboard landing page.
download_prefs = {"download.default_directory": "/Users/project561/cpcb_downloads"}
options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", download_prefs)

driver = webdriver.Chrome(options=options)
driver.get(HOME_URL)

Enter the CAPTCHA manually in the browser window opened above before running the cells below.

# Reload the station metadata from the file saved by the dry run.
metadata_df = pd.read_csv("metadata.csv")
# notebook display: first two rows
metadata_df.head(2)
State City Station site_id
0 Andhra Pradesh Amaravati Secretariat, Amaravati - APPCB site_1406
1 Andhra Pradesh Anantapur Gulzarpet, Anantapur - APPCB site_5632
# Find out which stations already have a file in the download directory, so
# that re-running this cell resumes instead of downloading everything again.
files = glob("/Users/project561/cpcb_downloads/*.xlsx")
print("Number of files in the download directory:", len(files))
# The site id is the shortest "site_<digits>" run that is directly followed
# by "2024" in the file path.
site_pattern = re.compile(r"(site_\d+?)2024")
site_ids = set()
for file in files:
    found = site_pattern.search(file)
    if found is None:
        # An .xlsx file that does not follow the naming scheme; ignore it
        # rather than failing with AttributeError on a missing match.
        continue
    site_ids.add(found.group(1))

station_rows = metadata_df.itertuples(index=False, name=None)
for i, (state, city, station, site_id) in enumerate(station_rows):
    if site_id in site_ids:
        # already downloaded
        continue
    print("Downloading", i, state, city, station, site_id)
    url = get_url(state, city, site_id)

    # open new tab
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[-1])
    driver.get(url)
    # wait (up to 20 seconds) for the Excel export button, then click it
    excel_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "fa-file-excel-o")))
    click_it(driver, excel_button)
    sleep(1)

    if len(driver.window_handles) > 10:
        # close first 9 windows
        for _ in range(9):
            driver.switch_to.window(driver.window_handles[0])
            driver.close()

        # continue from the most recently opened tab
        driver.switch_to.window(driver.window_handles[-1])
        sleep(1)
Number of files in the download directory: 302
Downloading 301 Maharashtra Nagpur Ram Nagar, Nagpur - MPCB site_5793
Downloading 302 Maharashtra Nagpur Mahal, Nagpur - MPCB site_5796
Downloading 303 Maharashtra Nagpur Opp GPO Civil Lines, Nagpur - MPCB site_303
Downloading 304 Maharashtra Nagpur Ambazari, Nagpur - MPCB site_5792
Downloading 305 Maharashtra Nanded Sneh Nagar, Nanded - MPCB site_5795
Downloading 306 Maharashtra Nashik Pandav Nagari, Nashik - MPCB site_5779
Downloading 307 Maharashtra Nashik MIDC Ambad, Nashik - MPCB site_5781
Downloading 308 Maharashtra Nashik Gangapur Road, Nashik - MPCB site_304
Downloading 309 Maharashtra Nashik Hirawadi, Nashik - MPCB site_5782
Downloading 310 Maharashtra Navi Mumbai Tondare-Taloja, Navi Mumbai - MPCB site_5803
Downloading 311 Maharashtra Navi Mumbai Sanpada, Navi Mumbai - MPCB site_5815
Downloading 312 Maharashtra Navi Mumbai Airoli, Navi Mumbai - MPCB site_261
Downloading 313 Maharashtra Navi Mumbai Mahape, Navi Mumbai - MPCB site_5114
Downloading 314 Maharashtra Navi Mumbai Kopripada-Vashi, Navi Mumbai - MPCB site_5805
Downloading 315 Maharashtra Navi Mumbai Sector-19A Nerul, Navi Mumbai - IITM site_5401
Downloading 316 Maharashtra Navi Mumbai Nerul, Navi Mumbai - MPCB site_5103
Downloading 317 Maharashtra Navi Mumbai Sector-2E Kalamboli, Navi Mumbai - MPCB site_5799
Downloading 318 Maharashtra Parbhani Masoom Colony, Parbhani - MPCB site_5794
Downloading 319 Maharashtra Pimpri-Chinchwad Park Street Wakad, Pimpri Chinchwad - MPCB site_5764
Downloading 320 Maharashtra Pimpri-Chinchwad Savta Mali Nagar, Pimpri-Chinchwad - IITM site_5998
Downloading 321 Maharashtra Pimpri-Chinchwad Thergaon, Pimpri Chinchwad - MPCB site_5765
Downloading 322 Maharashtra Pimpri-Chinchwad Gavalinagar, Pimpri Chinchwad - MPCB site_5763
Downloading 323 Maharashtra Pune Revenue Colony-Shivajinagar, Pune - IITM site_5409
Downloading 324 Maharashtra Pune Mhada Colony, Pune - IITM site_5404
Downloading 325 Maharashtra Pune Savitribai Phule Pune University, Pune - MPCB site_5767
Downloading 326 Maharashtra Pune Bhumkar Nagar, Pune - IITM site_5988
Downloading 327 Maharashtra Pune Hadapsar, Pune - IITM site_5407
Downloading 328 Maharashtra Pune Karve Road, Pune - MPCB site_292
Downloading 329 Maharashtra Pune Alandi, Pune - IITM site_5405