# html_table_extractor.py
"""Extract every HTML <table> from a web page and save each one as a CSV file.

Usage: python html_table_extractor.py [URL]
"""
import sys

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

# Identify ourselves as a regular desktop browser so sites don't reject the request.
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# US english
LANGUAGE = "en-US,en;q=0.5"


def get_soup(url):
    """Construct and return a soup using the HTML content of `url` passed.

    Raises requests.HTTPError on a 4xx/5xx response instead of silently
    parsing the server's error page.
    """
    # initialize a session
    session = requests.Session()
    # set the User-Agent as a regular browser
    session.headers["User-Agent"] = USER_AGENT
    # request for english content (optional)
    session.headers["Accept-Language"] = LANGUAGE
    session.headers["Content-Language"] = LANGUAGE
    # make the request
    response = session.get(url)
    response.raise_for_status()
    # return the soup
    return bs(response.content, "html.parser")


def get_all_tables(soup):
    """Extract and return all <table> elements in a soup object."""
    return soup.find_all("table")


def get_table_headers(table):
    """Given a table soup, return the stripped text of the <th> cells in its
    first row.

    Returns an empty list for a table that has no rows at all (the original
    crashed with AttributeError on `None.find_all`).
    """
    first_row = table.find("tr")
    if first_row is None:
        return []
    return [th.text.strip() for th in first_row.find_all("th")]


def get_table_rows(table):
    """Given a table, return all its data rows (every <tr> after the first).

    Each row is a list of stripped cell strings.
    """
    rows = []
    for tr in table.find_all("tr")[1:]:
        # grab all td tags in this table row
        tds = tr.find_all("td")
        if not tds:
            # if no td tags, search for th tags
            # can be found especially in wikipedia tables below the table
            cells = [th.text.strip() for th in tr.find_all("th")]
        else:
            # use regular td tags
            cells = [td.text.strip() for td in tds]
        rows.append(cells)
    return rows


def save_as_csv(table_name, headers, rows):
    """Write `rows` (with `headers` as column names) to `<table_name>.csv`.

    `index=False` keeps pandas from writing its row index as a spurious
    unnamed first column.  When `headers` is empty (headerless table),
    fall back to pandas' default numeric columns instead of raising a
    length-mismatch error.
    """
    pd.DataFrame(rows, columns=headers or None).to_csv(
        f"{table_name}.csv", index=False
    )


def main(url):
    """Extract every table found at `url` and save each as table-<i>.csv."""
    # get the soup
    soup = get_soup(url)
    # extract all the tables from the web page
    tables = get_all_tables(soup)
    print(f"[+] Found a total of {len(tables)} tables.")
    # iterate over all tables
    for i, table in enumerate(tables, start=1):
        # get the table headers
        headers = get_table_headers(table)
        # get all the rows of the table
        rows = get_table_rows(table)
        # save table as csv file
        table_name = f"table-{i}"
        print(f"[+] Saving {table_name}")
        save_as_csv(table_name, headers, rows)


if __name__ == "__main__":
    try:
        url = sys.argv[1]
    except IndexError:
        print("Please specify a URL.\nUsage: python html_table_extractor.py [URL]")
        sys.exit(1)
    main(url)