Download CPCB CAAQM locations

Download CPCB CAAQM locations using Selenium
Data
Author

Zeel B Patel

Published

December 27, 2024

# Install selenium on the fly when it cannot be imported.
# NOTE: ``%pip`` is an IPython magic, so this cell only works inside a
# notebook/IPython session, not as a plain Python script.
try:
    import selenium
except ModuleNotFoundError:
    %pip install selenium

import os
import re
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm, trange
from time import sleep, time
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Start every run with a fresh log. A missing file is the normal case on the
# first run, so it is ignored instead of producing an error message (and this
# also works on platforms without an ``rm`` command).
try:
    os.remove("log.txt")
except FileNotFoundError:
    pass

def print_it(*args, **kwargs):
    """Print a message and append the same message to ``log.txt``.

    Accepts the same positional and keyword arguments as the built-in
    ``print`` (``sep``, ``end``, ``file``, ``flush``).

    Args:
        *args: Objects to print.
        **kwargs: Keyword arguments forwarded to ``print``. A ``file``
            argument only affects the first (console) print; the log copy
            always goes to ``log.txt``.
    """
    print(*args, **kwargs)
    # ``file`` must not be forwarded a second time: combining it with
    # ``file=f`` below would raise TypeError (duplicate keyword argument).
    log_kwargs = {key: value for key, value in kwargs.items() if key != "file"}
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open('log.txt', 'a', encoding='utf-8') as f:
        print(*args, **log_kwargs, file=f)

# Timestamp taken when this cell runs (value returned by ``time.time``).
global_init = time()
rm: log.txt: No such file or directory
# Set up WebDriver
# No arguments are added to the options object, so Chrome starts with its
# default settings.
op = webdriver.ChromeOptions()

driver = webdriver.Chrome(options=op)

# Navigate to the website and manually solve the CAPTCHA
# (the CAPTCHA has to be solved by hand in the browser window before the
# following cells are run).
driver.get("https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-landing")

Manually solve the CAPTCHA before moving on to the next cell.

# Full class list seen on a station marker element:
# leaflet-marker-icon custom-div-icon map_markers station_status_live leaflet-zoom-animated leaflet-interactive
# Only ``leaflet-marker-icon`` is used to select the markers.
all_station_markers = driver.find_elements(By.CLASS_NAME, 'leaflet-marker-icon')

# The count is kept separately because the marker list is fetched again
# later, while this number serves as the fixed total for the scraping loop.
all_stations_len = len(all_station_markers)
print("Total stations: ", all_stations_len)
Total stations:  558
def get_after(string, phrase):
    """Return the part of ``string`` that follows the first ``phrase``.

    Raises:
        ValueError: If ``phrase`` does not occur in ``string``.
    """
    phrase_start = string.index(phrase)
    tail_start = phrase_start + len(phrase)
    return string[tail_start:]

# Maximum number of tries for a single marker before it is skipped. Without a
# bound, a failure that repeats on every try (e.g. an unexpected popup layout)
# would keep the loop asking for a CAPTCHA forever.
MAX_ATTEMPTS_PER_MARKER = 3


def _parse_station_popup(text):
    """Parse the text of a station popup.

    The text is expected to consist of exactly three lines:
    ``Station Name: ...``, ``Address: ...`` and
    ``Latitude: ..., Longitude: ...``.

    Args:
        text: Text of the popup element.

    Returns:
        A ``(station, record)`` tuple; ``record`` is a dict with the keys
        ``address``, ``latitude`` and ``longitude`` (coordinates as floats).

    Raises:
        ValueError: If the text does not have the expected layout.
    """
    # Explicit check instead of ``assert``: asserts are removed under ``-O``.
    if "Station Name" not in text:
        raise ValueError(f"Unexpected popup content: {text!r}")
    station, address, location = text.split('\n')
    latitude, longitude = location.split(",")
    record = {
        "address": get_after(address, "Address: "),
        "latitude": float(get_after(latitude, "Latitude: ")),
        "longitude": float(get_after(longitude, "Longitude: ")),
    }
    return get_after(station, "Station Name: "), record


# Click every station marker in turn, read its popup and collect the result
# in ``data`` (station name -> address/latitude/longitude).
data = {}
failed_marker_ids = []  # markers skipped after MAX_ATTEMPTS_PER_MARKER tries
all_station_markers = driver.find_elements(By.CLASS_NAME, 'leaflet-marker-icon')
marker_id = 0
attempts = 0  # consecutive failures for the current marker
progress_bar = tqdm(total=all_stations_len, desc="Progress")
try:
    while marker_id < all_stations_len:
        try:
            marker = all_station_markers[marker_id]
            # Click through JavaScript rather than ``marker.click()``.
            driver.execute_script("arguments[0].click();", marker)
            # The popup counts as open once its close button is clickable;
            # ``until`` returns that button, so it is reused below.
            close = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'close'))
            )
            children = driver.find_elements(By.CLASS_NAME, "col-md-12")
            station, record = _parse_station_popup(children[3].text)
            data[station] = record
            close.click()
            sleep(0.5)
        except Exception as e:
            attempts += 1
            # Report the real error: not every failure is a CAPTCHA.
            print_it(
                f"Marker {marker_id} failed "
                f"(attempt {attempts}/{MAX_ATTEMPTS_PER_MARKER}): {e!r}"
            )
            if attempts >= MAX_ATTEMPTS_PER_MARKER:
                # Give up on this marker so the run can finish.
                failed_marker_ids.append(marker_id)
                marker_id += 1
                attempts = 0
                progress_bar.update(1)
            # Reload the page and fetch the markers again: the old element
            # references belong to the page that was just replaced.
            driver.refresh()
            input("Please manually solve the Captcha")
            all_station_markers = driver.find_elements(By.CLASS_NAME, 'leaflet-marker-icon')
        else:
            marker_id += 1
            attempts = 0
            progress_bar.update(1)
finally:
    # Close the bar even when the loop is interrupted (e.g. Ctrl-C).
    progress_bar.close()

if failed_marker_ids:
    print_it(f"Skipped {len(failed_marker_ids)} marker(s): {failed_marker_ids}")
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[4], line 12
     10 marker = all_station_markers[marker_id]
     11 driver.execute_script("arguments[0].click();", marker)
---> 12 WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'close')))
     13 children = driver.find_elements(By.CLASS_NAME, "col-md-12")
     14 assert "Station Name" in children[3].text

File /opt/miniconda3/lib/python3.12/site-packages/selenium/webdriver/support/wait.py:102, in WebDriverWait.until(self, method, message)
    100     screen = getattr(exc, "screen", None)
    101     stacktrace = getattr(exc, "stacktrace", None)
--> 102 time.sleep(self._poll)
    103 if time.monotonic() > end_time:
    104     break

KeyboardInterrupt: 
# Build one row per station, indexed by station name.
# ``from_dict(orient="index")`` is used instead of ``pd.DataFrame(data).T``:
# transposing a frame whose columns mix strings and floats leaves every
# column with ``object`` dtype, whereas this keeps ``latitude`` and
# ``longitude`` numeric.
df = pd.DataFrame.from_dict(data, orient="index")
df.index.name = "station"
df.head(2)
address latitude longitude
station
SIDCO Kurichi, Coimbatore - TNPCB SIDCO Kurichi, Coimbatore, Tamil Nadu. 10.942451 76.978996
Muradpur, Patna - BSPCB S K Memorial Hall Premises, Near Gandhi Maidan... 25.619651 85.147382
# Write the table to ``station_data.csv``; the index is included as a column.
df.to_csv("station_data.csv")