Source code for scrape_numbeo_indices

import pandas as pd

import scrape_urls


def to_pandas_df(rows: list) -> pd.DataFrame:
    """
    Converts a list of HTML rows to a pandas dataframe.

    The first row supplies the column names (the text of its ``th`` cells);
    every following row supplies one record (the text of its ``td`` cells).
    Cell text is used exactly as found, without stripping whitespace.

    :param rows: list of row elements exposing ``find_all``; the first
        element is treated as the header row.
    :type rows: list
    :return: pandas dataframe with one row per data row. An empty dataframe
        is returned when ``rows`` is empty.
    :rtype: pd.DataFrame
    """
    # Nothing was scraped: return an empty frame instead of failing on rows[0].
    if not rows:
        return pd.DataFrame()

    header_row, *data_rows = rows
    headers = [cell.text for cell in header_row.find_all('th')]
    records = [
        [cell.text for cell in row.find_all('td')]
        for row in data_rows
    ]
    return pd.DataFrame(records, columns=headers)
def scrape_index(url: str = 'https://www.numbeo.com/pollution/rankings_by_country.jsp',
                 columns: tuple = ('Country', 'Pollution')) -> None:
    """
    Scrapes a rankings table from the numbeo website and saves it as a CSV file.

    The page is fetched through ``scrape_urls.scrape_page``, the table whose
    ``id`` is ``t2`` is converted to a dataframe, reduced to ``columns`` and
    written (without the index) to
    ``'<first word of the last column> by Country.csv'``, a path relative to
    the current working directory.

    :param url: url to scrape.
    :type url: str
    :param columns: names of the columns to keep; a tuple or a list is accepted.
        The first word of the last name is used in the output file name.
    :type columns: tuple
    :return: nothing; the result is written to disk.
    :rtype: None
    :raises ValueError: if the page contains no table with id ``t2``.
    :raises KeyError: if one of ``columns`` is missing from the scraped table.
    """
    soup = scrape_urls.scrape_page(url)

    ranking_table = soup.find('table', id='t2')
    if ranking_table is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f'No table with id "t2" found at {url}')

    frame = to_pandas_df(ranking_table.find_all('tr'))

    # pandas interprets a tuple as ONE key (MultiIndex-style lookup), so the
    # selection must be made with a list to pick several columns.
    frame = frame[list(columns)]
    frame.to_csv(f'{columns[-1].split()[0]} by Country.csv', index=False)
if __name__ == '__main__':
    # Each entry pairs a numbeo rankings page with the columns to keep from it.
    scrape_jobs = (
        ('https://www.numbeo.com/pollution/rankings_by_country.jsp',
         ['Country', 'Pollution Index']),
        ('https://www.numbeo.com/health-care/rankings_by_country.jsp',
         ['Country', 'Health Care Index']),
        ('https://www.numbeo.com/crime/rankings_by_country.jsp',
         ['Country', 'Crime Index', 'Safety Index']),
    )
    # Run the jobs in the listed order; each one writes its own CSV file.
    for page_url, wanted_columns in scrape_jobs:
        scrape_index(page_url, wanted_columns)