import os
import re
from glob import glob
import pandas as pd
from tqdm.notebook import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from time import sleep
# Entry points of the CPCB CAAQM portal used in this notebook.
# Landing page of the dashboard.
HOME_URL = "https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing"
# Page opened for the metadata dry run (it exposes the ".select-box" dropdowns).
DOWNLOAD_OLD_DATA_URL = f"{HOME_URL}/caaqm-data-repository"
DOWNLOAD_PAGE_URL = f"{HOME_URL}/data"
def click_it(driver, element):
    """Click *element* by running a JavaScript ``click()`` on it through *driver*.

    Args:
        driver: WebDriver used to execute the script.
        element: Element passed to the script as ``arguments[0]``.
    """
    driver.execute_script("arguments[0].click();", element)
def find_it(element, option):
    """Return the ``<li>`` entry labelled *option*.

    An exact (whitespace-normalised) match among the descendants of *element*
    is tried first, so that a label such as "Noida" is not satisfied by
    "Greater Noida". If no exact match exists, the lookup falls back to the
    original document-wide substring search.

    Args:
        element: Dropdown element whose list entries are searched.
        option: Visible label of the entry to locate.

    Returns:
        The matching list-item element.

    Raises:
        selenium.common.exceptions.NoSuchElementException: If neither lookup
            finds an entry.
    """
    from selenium.common.exceptions import NoSuchElementException

    # XPath 1.0 string literals have no escape syntax, so choose a quoting
    # that is valid for whatever quote characters the label contains.
    if "'" not in option:
        literal = f"'{option}'"
    elif '"' not in option:
        literal = f'"{option}"'
    else:
        pieces = (f"'{piece}'" for piece in option.split("'"))
        literal = "concat(" + ", \"'\", ".join(pieces) + ")"

    try:
        # The leading "." keeps the search inside this dropdown; a bare "//"
        # would search the whole document even when called on an element.
        return element.find_element(By.XPATH, f".//li[normalize-space(.)={literal}]")
    except NoSuchElementException:
        # Fall back to the original lookup: first <li> anywhere in the
        # document whose text contains the label.
        return element.find_element(By.XPATH, f"//li[contains(text(), {literal})]")
def select_dropdown_option(driver, element, option):
    """Open the dropdown *element* and choose the entry labelled *option*.

    Args:
        driver: WebDriver used to perform the JavaScript click.
        element: Dropdown element to open.
        option: Visible label of the entry to select.
    """
    element.click()  # open the dropdown so its entries can be located
    option_element = find_it(element, option)
    click_it(driver, option_element)
Dry run to get metadata
# headless chrome
options = Options()
options.add_argument("--headless")

# open the browser
driver = webdriver.Chrome(options=options)

# open the website
driver.get(DOWNLOAD_OLD_DATA_URL)
# wait (up to 10 seconds) for the page to load and the dropdowns to appear
dropdowns = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".select-box"))
)
len(dropdowns)
5
# The unpacking below requires exactly five dropdowns on the page.
drop_data_type, drop_frequency, drop_states, drop_cities, drop_stations = dropdowns

# Select data type
select_dropdown_option(driver, drop_data_type, "Raw data")

# Select frequency
select_dropdown_option(driver, drop_frequency, "1 day")
# Get the states
# Open the dropdown
drop_states.click()
# The opened dropdown's text is one entry per line; drop the "▲" marker line.
states = drop_states.text.replace("▲\n", "").split("\n")
print("Number of states:", len(states))
# Close the dropdown
drop_states.click()
Number of states: 31
def _dropdown_options(dropdown):
    """Open *dropdown*, read the option labels it lists, and close it again."""
    dropdown.click()  # open
    labels = dropdown.text.replace("▲\n", "").split("\n")
    dropdown.click()  # close
    return labels


metadata_df = pd.DataFrame(columns=["State", "City", "Station", "site_id"])

# This loop took less than a minute to run
# as of 2024, 560 stations. update this number if it changes
with tqdm(total=600) as progress_bar:
    for state in states:
        select_dropdown_option(driver, drop_states, state)

        # Get all cities
        cities = _dropdown_options(drop_cities)
        for city in cities:
            select_dropdown_option(driver, drop_cities, city)

            # Get all stations
            stations = _dropdown_options(drop_stations)
            for station in stations:
                # corner cases: these two stations get a hard-coded site id
                # instead of being selected in the dropdown
                if station == "Municipal Corporation Office, Dharuhera - HSPCB":
                    site_id = "site_5044"
                elif station == "Civil Lines, Ajmer - RSPCB":
                    site_id = "site_1392"
                else:
                    try:
                        select_dropdown_option(driver, drop_stations, station)
                    except Exception as error:
                        # Best effort: report the station and move on. Unlike a
                        # bare ``except``, this lets KeyboardInterrupt through.
                        print("Unable to select station")
                        print(station)
                        print(drop_stations.text)
                        print(error)
                        continue
                    # The selected station's site id is read from this attribute.
                    site_id = drop_stations.get_attribute("ng-reflect-model")

                metadata_df.loc[len(metadata_df)] = [state, city, station, site_id]
                progress_bar.update(1)

len(metadata_df)
560
# Preview the first rows of the collected station metadata
metadata_df.head()
State | City | Station | site_id | |
---|---|---|---|---|
0 | Andhra Pradesh | Amaravati | Secretariat, Amaravati - APPCB | site_1406 |
1 | Andhra Pradesh | Anantapur | Gulzarpet, Anantapur - APPCB | site_5632 |
2 | Andhra Pradesh | Chittoor | Gangineni Cheruvu, Chittoor - APPCB | site_5665 |
3 | Andhra Pradesh | Kadapa | Yerramukkapalli, Kadapa - APPCB | site_5693 |
4 | Andhra Pradesh | Rajamahendravaram | Anand Kala Kshetram, Rajamahendravaram - APPCB | site_1399 |
# Preview the last rows of the collected station metadata
metadata_df.tail()
State | City | Station | site_id | |
---|---|---|---|---|
555 | West Bengal | Kolkata | Rabindra Bharati University, Kolkata - WBPCB | site_296 |
556 | West Bengal | Kolkata | Fort William, Kolkata - WBPCB | site_5110 |
557 | West Bengal | Kolkata | Victoria, Kolkata - WBPCB | site_309 |
558 | West Bengal | Kolkata | Bidhannagar, Kolkata - WBPCB | site_5129 |
559 | West Bengal | Siliguri | Ward-32 Bapupara, Siliguri - WBPCB | site_1419 |
# Print every group of rows that share a site_id with at least one other row.
site_id_counts = metadata_df.site_id.value_counts()
for site_id in site_id_counts[site_id_counts > 1].index:
    print(metadata_df[metadata_df.site_id == site_id])
State City Station site_id
25 Bihar Aurangabad MIDC Chilkalthana, Aurangabad - MPCB site_5788
254 Maharashtra Aurangabad MIDC Chilkalthana, Aurangabad - MPCB site_5788
State City Station site_id
26 Bihar Aurangabad More Chowk Waluj, Aurangabad - MPCB site_198
255 Maharashtra Aurangabad More Chowk Waluj, Aurangabad - MPCB site_198
State City Station \
499 Uttar Pradesh Greater Noida Knowledge Park - V, Greater Noida - UPPCB
526 Uttar Pradesh Noida Knowledge Park - V, Greater Noida - UPPCB
site_id
499 site_5121
526 site_5121
State City \
498 Uttar Pradesh Greater Noida
525 Uttar Pradesh Noida
Station site_id
498 Knowledge Park - III, Greater Noida - UPPCB site_1541
525 Knowledge Park - III, Greater Noida - UPPCB site_1541
State City Station site_id
28 Bihar Aurangabad Rachnakar Colony, Aurangabad - MPCB site_5789
257 Maharashtra Aurangabad Rachnakar Colony, Aurangabad - MPCB site_5789
State City Station site_id
27 Bihar Aurangabad Gurdeo Nagar, Aurangabad - BSPCB site_5544
256 Maharashtra Aurangabad Gurdeo Nagar, Aurangabad - BSPCB site_5544
# clean up
# For each duplicated site_id, one (column, value, station) triple that
# identifies the copy to discard.
rows_to_discard = [
    ("State", "Bihar", "MIDC Chilkalthana, Aurangabad - MPCB"),
    ("City", "Noida", "Knowledge Park - III, Greater Noida - UPPCB"),
    ("State", "Bihar", "More Chowk Waluj, Aurangabad - MPCB"),
    ("State", "Maharashtra", "Gurdeo Nagar, Aurangabad - BSPCB"),
    ("State", "Bihar", "Rachnakar Colony, Aurangabad - MPCB"),
    ("City", "Noida", "Knowledge Park - V, Greater Noida - UPPCB"),
]
# ``.index.item()`` raises unless exactly one row matches, which guards
# against a filter silently matching nothing or too much.
drop_items = [
    metadata_df[(metadata_df[column] == value) & (metadata_df.Station == station)].index.item()
    for column, value, station in rows_to_discard
]
metadata_df.drop(drop_items, inplace=True)
len(metadata_df)
554
# Every site_id must now appear exactly once before the metadata is saved.
assert set(metadata_df.site_id.value_counts()) == {1}, "site_id values are not unique"
metadata_df.to_csv("metadata.csv", index=False)
Downloading data
# URL is specific to PM2.5 and PM10 so update it as per your needs
def get_url(state, city, site_id, from_date="01-01-2024", to_date="11-12-2024"):
    """Build the report URL for one station (PM2.5 and PM10, 24-hour, tabular).

    The part after ``caaqm-view-data-report/`` is a doubly percent-encoded
    JSON string describing the report request.

    Args:
        state: State name; spaces are encoded as ``%2520``.
        city: City name; spaces are encoded as ``%2520``.
        site_id: Station identifier, e.g. ``"site_5121"``.
        from_date: Start date as it appears in the request (``DD-MM-YYYY`` by
            the look of the defaults); the time part is fixed to 00:00:00Z.
        to_date: End date in the same format; the time part is fixed to
            16:45:59Z.

    Returns:
        The complete URL as a string.
    """
    state = state.replace(" ", "%2520")
    city = city.replace(" ", "%2520")
    return (
        "https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-view-data-report/"
        # parameter_list: PM2.5 (parameter_193) and PM10 (parameter_215)
        "%2522%257B%255C%2522parameter_list%255C%2522%253A%255B"
        "%257B%255C%2522id%255C%2522%253A0%252C%255C%2522itemName%255C%2522%253A%255C%2522PM2.5%255C%2522%252C%255C%2522itemValue%255C%2522%253A%255C%2522parameter_193%255C%2522%257D%252C"
        "%257B%255C%2522id%255C%2522%253A1%252C%255C%2522itemName%255C%2522%253A%255C%2522PM10%255C%2522%252C%255C%2522itemValue%255C%2522%253A%255C%2522parameter_215%255C%2522%257D%255D%252C"
        # criteria and report format
        "%255C%2522criteria%255C%2522%253A%255C%252224%2520Hours%255C%2522%252C"
        "%255C%2522reportFormat%255C%2522%253A%255C%2522Tabular%255C%2522%252C"
        # date range
        f"%255C%2522fromDate%255C%2522%253A%255C%2522{from_date}%2520T00%253A00%253A00Z%255C%2522%252C"
        f"%255C%2522toDate%255C%2522%253A%255C%2522{to_date}%2520T16%253A45%253A59Z%255C%2522%252C"
        # location
        f"%255C%2522state%255C%2522%253A%255C%2522{state}%255C%2522%252C"
        f"%255C%2522city%255C%2522%253A%255C%2522{city}%255C%2522%252C"
        f"%255C%2522station%255C%2522%253A%255C%2522{site_id}%255C%2522%252C"
        # parameter ids and names
        "%255C%2522parameter%255C%2522%253A%255B%255C%2522parameter_193%255C%2522%252C%255C%2522parameter_215%255C%2522%255D%252C"
        "%255C%2522parameterNames%255C%2522%253A%255B%255C%2522PM2.5%255C%2522%252C%255C%2522PM10%255C%2522%255D%257D%2522"
    )
# add download directory
options = webdriver.ChromeOptions()
options.add_experimental_option(
    "prefs",
    {
        "download.default_directory": "/Users/project561/cpcb_downloads",
    },
)

# open a (non-headless) browser on the landing page
driver = webdriver.Chrome(options=options)
driver.get(HOME_URL)
Enter Captcha manually before moving ahead
# Reload the station metadata saved earlier
metadata_df = pd.read_csv("metadata.csv")
metadata_df.head(2)
State | City | Station | site_id | |
---|---|---|---|---|
0 | Andhra Pradesh | Amaravati | Secretariat, Amaravati - APPCB | site_1406 |
1 | Andhra Pradesh | Anantapur | Gulzarpet, Anantapur - APPCB | site_5632 |
files = glob("/Users/project561/cpcb_downloads/*.xlsx")
print("Number of files in the download directory:", len(files))

# The pattern expects a site id directly followed by "2024" in the file name
# (presumably the start of a download timestamp - confirm against real file
# names); the trailing four characters are stripped to leave just the site id.
site_id_pattern = re.compile(r"site_\d+?2024")
site_ids = set()
for file in files:
    match = site_id_pattern.search(file)
    if match is None:
        # Some other .xlsx file in the directory: skip it rather than crash.
        continue
    site_ids.add(match.group()[:-4])

# Unpacking requires metadata_df to have exactly these four columns.
for i, (state, city, station, site_id) in enumerate(
    metadata_df.itertuples(index=False, name=None)
):
    if site_id in site_ids:
        # Already downloaded in an earlier run.
        continue
    print("Downloading", i, state, city, station, site_id)
    url = get_url(state, city, site_id)

    # open new tab
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[-1])
    driver.get(url)

    excel_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "fa-file-excel-o"))
    )
    click_it(driver, excel_button)
    sleep(1)

    if len(driver.window_handles) > 10:
        # close first 9 windows
        for _ in range(9):
            driver.switch_to.window(driver.window_handles[0])
            driver.close()

        # The focused window was just closed, so move focus to the newest tab.
        driver.switch_to.window(driver.window_handles[-1])
        sleep(1)
Number of files in the download directory: 302
Downloading 301 Maharashtra Nagpur Ram Nagar, Nagpur - MPCB site_5793
Downloading 302 Maharashtra Nagpur Mahal, Nagpur - MPCB site_5796
Downloading 303 Maharashtra Nagpur Opp GPO Civil Lines, Nagpur - MPCB site_303
Downloading 304 Maharashtra Nagpur Ambazari, Nagpur - MPCB site_5792
Downloading 305 Maharashtra Nanded Sneh Nagar, Nanded - MPCB site_5795
Downloading 306 Maharashtra Nashik Pandav Nagari, Nashik - MPCB site_5779
Downloading 307 Maharashtra Nashik MIDC Ambad, Nashik - MPCB site_5781
Downloading 308 Maharashtra Nashik Gangapur Road, Nashik - MPCB site_304
Downloading 309 Maharashtra Nashik Hirawadi, Nashik - MPCB site_5782
Downloading 310 Maharashtra Navi Mumbai Tondare-Taloja, Navi Mumbai - MPCB site_5803
Downloading 311 Maharashtra Navi Mumbai Sanpada, Navi Mumbai - MPCB site_5815
Downloading 312 Maharashtra Navi Mumbai Airoli, Navi Mumbai - MPCB site_261
Downloading 313 Maharashtra Navi Mumbai Mahape, Navi Mumbai - MPCB site_5114
Downloading 314 Maharashtra Navi Mumbai Kopripada-Vashi, Navi Mumbai - MPCB site_5805
Downloading 315 Maharashtra Navi Mumbai Sector-19A Nerul, Navi Mumbai - IITM site_5401
Downloading 316 Maharashtra Navi Mumbai Nerul, Navi Mumbai - MPCB site_5103
Downloading 317 Maharashtra Navi Mumbai Sector-2E Kalamboli, Navi Mumbai - MPCB site_5799
Downloading 318 Maharashtra Parbhani Masoom Colony, Parbhani - MPCB site_5794
Downloading 319 Maharashtra Pimpri-Chinchwad Park Street Wakad, Pimpri Chinchwad - MPCB site_5764
Downloading 320 Maharashtra Pimpri-Chinchwad Savta Mali Nagar, Pimpri-Chinchwad - IITM site_5998
Downloading 321 Maharashtra Pimpri-Chinchwad Thergaon, Pimpri Chinchwad - MPCB site_5765
Downloading 322 Maharashtra Pimpri-Chinchwad Gavalinagar, Pimpri Chinchwad - MPCB site_5763
Downloading 323 Maharashtra Pune Revenue Colony-Shivajinagar, Pune - IITM site_5409
Downloading 324 Maharashtra Pune Mhada Colony, Pune - IITM site_5404
Downloading 325 Maharashtra Pune Savitribai Phule Pune University, Pune - MPCB site_5767
Downloading 326 Maharashtra Pune Bhumkar Nagar, Pune - IITM site_5988
Downloading 327 Maharashtra Pune Hadapsar, Pune - IITM site_5407
Downloading 328 Maharashtra Pune Karve Road, Pune - MPCB site_292
Downloading 329 Maharashtra Pune Alandi, Pune - IITM site_5405