# Configure Chrome to run without a visible window.
options = Options()
options.add_argument("--headless")

# Start the browser and load the download page.
driver = webdriver.Chrome(options=options)
driver.get(DOWNLOAD_OLD_DATA_URL)

# Block (for at most 10 seconds) until the page has loaded and the
# ".select-box" dropdown elements are present.
select_box_locator = (By.CSS_SELECTOR, ".select-box")
page_wait = WebDriverWait(driver, 10)
dropdowns = page_wait.until(
    EC.presence_of_all_elements_located(select_box_locator)
)
len(dropdowns)
# Choose the report settings: raw data at a daily frequency.
select_dropdown_option(driver, drop_data_type, "Raw data")
select_dropdown_option(driver, drop_frequency, "1 day")

# The state names are only readable while the dropdown is open, so open it,
# capture its text, and close it again.
drop_states.click()
states_dropdown_text = drop_states.text
drop_states.click()

# Drop the "▲" marker line from the captured text; each remaining line is
# one state name.
states = states_dropdown_text.replace("▲\n", "").split("\n")
print("Number of states:", len(states))
Number of states: 31
# Stations whose site id is hard-coded instead of being read back from the
# dropdown after selecting them (corner cases).
_KNOWN_SITE_IDS = {
    "Municipal Corporation Office, Dharuhera - HSPCB": "site_5044",
    "Civil Lines, Ajmer - RSPCB": "site_1392",
}


def _read_dropdown_options(dropdown):
    """Return the option labels currently offered by *dropdown*.

    The dropdown is clicked open, its text is split into one label per line
    (after removing the "▲" marker line), and it is clicked again to close.
    """
    dropdown.click()  # Open the dropdown
    labels = dropdown.text.replace("▲\n", "").split("\n")
    dropdown.click()  # Close the dropdown
    return labels


# Rows are accumulated in a plain list and turned into a DataFrame once at
# the end, instead of growing the DataFrame one row at a time.
metadata_rows = []

# This loop took less than a minute to run
progress_bar = tqdm(total=600)  # as of 2024, 560 stations. update this number if it changes
for state in states:
    select_dropdown_option(driver, drop_states, state)
    cities = _read_dropdown_options(drop_cities)

    for city in cities:
        select_dropdown_option(driver, drop_cities, city)
        stations = _read_dropdown_options(drop_stations)

        for station in stations:
            site_id = _KNOWN_SITE_IDS.get(station)
            if site_id is None:
                try:
                    select_dropdown_option(driver, drop_stations, station)
                except Exception:
                    # Best effort: report the station and move on. Catching
                    # Exception (not a bare except) keeps Ctrl-C working.
                    print("Unable to select station")
                    print(station)
                    print(drop_stations.text)
                    continue
                # After selection the element's "ng-reflect-model" attribute
                # is read as the station's site id.
                site_id = drop_stations.get_attribute("ng-reflect-model")

            metadata_rows.append([state, city, station, site_id])
            progress_bar.update(1)
progress_bar.close()

metadata_df = pd.DataFrame(
    metadata_rows, columns=["State", "City", "Station", "site_id"]
)
len(metadata_df)
560
# Preview the first rows of the station metadata table.
metadata_df.head()
State
City
Station
site_id
0
Andhra Pradesh
Amaravati
Secretariat, Amaravati - APPCB
site_1406
1
Andhra Pradesh
Anantapur
Gulzarpet, Anantapur - APPCB
site_5632
2
Andhra Pradesh
Chittoor
Gangineni Cheruvu, Chittoor - APPCB
site_5665
3
Andhra Pradesh
Kadapa
Yerramukkapalli, Kadapa - APPCB
site_5693
4
Andhra Pradesh
Rajamahendravaram
Anand Kala Kshetram, Rajamahendravaram - APPCB
site_1399
# Preview the last rows of the station metadata table.
metadata_df.tail()
State
City
Station
site_id
555
West Bengal
Kolkata
Rabindra Bharati University, Kolkata - WBPCB
site_296
556
West Bengal
Kolkata
Fort William, Kolkata - WBPCB
site_5110
557
West Bengal
Kolkata
Victoria, Kolkata - WBPCB
site_309
558
West Bengal
Kolkata
Bidhannagar, Kolkata - WBPCB
site_5129
559
West Bengal
Siliguri
Ward-32 Bapupara, Siliguri - WBPCB
site_1419
# Print, one group at a time, the metadata rows of every site_id that occurs
# on more than one row.
site_id_counts = metadata_df.site_id.value_counts()
repeated_site_ids = site_id_counts[site_id_counts > 1].index
for site_id in repeated_site_ids:
    rows_for_site = metadata_df[metadata_df.site_id == site_id]
    print(rows_for_site)
State City Station site_id
25 Bihar Aurangabad MIDC Chilkalthana, Aurangabad - MPCB site_5788
254 Maharashtra Aurangabad MIDC Chilkalthana, Aurangabad - MPCB site_5788
State City Station site_id
26 Bihar Aurangabad More Chowk Waluj, Aurangabad - MPCB site_198
255 Maharashtra Aurangabad More Chowk Waluj, Aurangabad - MPCB site_198
State City Station \
499 Uttar Pradesh Greater Noida Knowledge Park - V, Greater Noida - UPPCB
526 Uttar Pradesh Noida Knowledge Park - V, Greater Noida - UPPCB
site_id
499 site_5121
526 site_5121
State City \
498 Uttar Pradesh Greater Noida
525 Uttar Pradesh Noida
Station site_id
498 Knowledge Park - III, Greater Noida - UPPCB site_1541
525 Knowledge Park - III, Greater Noida - UPPCB site_1541
State City Station site_id
28 Bihar Aurangabad Rachnakar Colony, Aurangabad - MPCB site_5789
257 Maharashtra Aurangabad Rachnakar Colony, Aurangabad - MPCB site_5789
State City Station site_id
27 Bihar Aurangabad Gurdeo Nagar, Aurangabad - BSPCB site_5544
256 Maharashtra Aurangabad Gurdeo Nagar, Aurangabad - BSPCB site_5544
# URL is specific to PM2.5 and PM10 so update it as per your needs
def get_url(state, city, site_id, *, from_date="01-01-2024", to_date="11-12-2024"):
    """Build the CPCB "view data report" URL for one monitoring station.

    The report request (PM2.5 and PM10, "24 Hours" criteria, "Tabular"
    format) is embedded in the URL as a pre-encoded payload; only the state,
    city, station id and the two dates are substituted.

    Args:
        state: State name; spaces are replaced by "%2520".
        city: City name; spaces are replaced by "%2520".
        site_id: Station identifier such as "site_1406", inserted verbatim.
        from_date: Start date, inserted verbatim before the fixed
            " T00:00:00Z" time part (presumably DD-MM-YYYY -- confirm).
        to_date: End date, inserted verbatim before the fixed
            " T16:45:59Z" time part (same format as ``from_date``).

    Returns:
        The complete report URL as a string.
    """
    # Only spaces are encoded here; any other special character in the
    # names is passed through unchanged.
    state_enc = state.replace(" ", "%2520")
    city_enc = city.replace(" ", "%2520")

    return (
        "https://airquality.cpcb.gov.in/ccr/#/caaqm-dashboard-all/caaqm-view-data-report/"
        "%2522%257B"
        "%255C%2522parameter_list%255C%2522%253A%255B"
        "%257B%255C%2522id%255C%2522%253A0%252C"
        "%255C%2522itemName%255C%2522%253A%255C%2522PM2.5%255C%2522%252C"
        "%255C%2522itemValue%255C%2522%253A%255C%2522parameter_193%255C%2522%257D%252C"
        "%257B%255C%2522id%255C%2522%253A1%252C"
        "%255C%2522itemName%255C%2522%253A%255C%2522PM10%255C%2522%252C"
        "%255C%2522itemValue%255C%2522%253A%255C%2522parameter_215%255C%2522%257D%255D%252C"
        "%255C%2522criteria%255C%2522%253A%255C%252224%2520Hours%255C%2522%252C"
        "%255C%2522reportFormat%255C%2522%253A%255C%2522Tabular%255C%2522%252C"
        f"%255C%2522fromDate%255C%2522%253A%255C%2522{from_date}%2520T00%253A00%253A00Z%255C%2522%252C"
        f"%255C%2522toDate%255C%2522%253A%255C%2522{to_date}%2520T16%253A45%253A59Z%255C%2522%252C"
        f"%255C%2522state%255C%2522%253A%255C%2522{state_enc}%255C%2522%252C"
        f"%255C%2522city%255C%2522%253A%255C%2522{city_enc}%255C%2522%252C"
        f"%255C%2522station%255C%2522%253A%255C%2522{site_id}%255C%2522%252C"
        "%255C%2522parameter%255C%2522%253A%255B"
        "%255C%2522parameter_193%255C%2522%252C%255C%2522parameter_215%255C%2522%255D%252C"
        "%255C%2522parameterNames%255C%2522%253A%255B"
        "%255C%2522PM2.5%255C%2522%252C%255C%2522PM10%255C%2522%255D%257D%2522"
    )
# Work out which stations already have an export in the download directory.
files = glob("/Users/project561/cpcb_downloads/*.xlsx")
print("Number of files in the download directory:", len(files))

# The site id is the "site_<digits>" prefix that is followed by "2024" in the
# file name; the lazy quantifier stops at the first such "2024".
# NOTE(review): presumably "2024" is the year at the start of the export's
# date stamp -- confirm, and update it if the date range changes.
site_id_pattern = re.compile(r"(site_\d+?)2024")
site_ids = set()
for path in files:
    match = site_id_pattern.search(path)
    if match is None:
        # An .xlsx file that does not follow the naming pattern; skip it
        # instead of crashing on `None.group()`.
        continue
    site_ids.add(match.group(1))

# Request the Excel export for every station that has not been downloaded.
for i, (state, city, station, site_id) in enumerate(
    metadata_df.itertuples(index=False, name=None)
):
    if site_id in site_ids:
        # print("Already downloaded", i, state, city, station, site_id)
        continue

    print("Downloading", i, state, city, station, site_id)
    url = get_url(state, city, site_id)

    # Open the report in a new tab and switch to it.
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[-1])
    driver.get(url)

    # Wait (up to 20 s) for the Excel export button, then click it.
    excel_button = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "fa-file-excel-o"))
    )
    click_it(driver, excel_button)
    sleep(1)

    # Keep the number of open tabs bounded: once more than 10 are open,
    # close the 9 oldest and return to the newest one.
    if len(driver.window_handles) > 10:
        for _ in range(9):
            driver.switch_to.window(driver.window_handles[0])
            driver.close()
        driver.switch_to.window(driver.window_handles[-1])
        sleep(1)