Movatterモバイル変換


[0]ホーム

URL:



Code for How to Convert HTML Tables into CSV Files in Python Tutorial


View on Github

html_table_extractor.py

import sys

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
# US english
LANGUAGE = "en-US,en;q=0.5"
# seconds to wait for the server before giving up; without a timeout
# `requests` can block forever on an unresponsive host
REQUEST_TIMEOUT = 30


def get_soup(url, timeout=REQUEST_TIMEOUT):
    """Download `url` and return its HTML parsed into a BeautifulSoup object.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server (passed to `requests`).

    Returns:
        A soup built from the response body with the "html.parser" backend.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    # the context manager closes the session's pooled connections on exit
    with requests.Session() as session:
        # set the User-Agent as a regular browser
        session.headers["User-Agent"] = USER_AGENT
        # request for english content (optional)
        session.headers["Accept-Language"] = LANGUAGE
        session.headers["Content-Language"] = LANGUAGE
        response = session.get(url, timeout=timeout)
        return bs(response.content, "html.parser")


def get_all_tables(soup):
    """Return every `<table>` element found in `soup`."""
    return soup.find_all("table")


def get_table_headers(table):
    """Return the stripped text of each `<th>` in the first row of `table`.

    Returns an empty list when the table has no `<tr>` at all or when its
    first row contains no `<th>` cells.
    """
    first_row = table.find("tr")
    if first_row is None:
        # a table without any row has no headers either
        return []
    return [th.text.strip() for th in first_row.find_all("th")]


def get_table_rows(table):
    """Return the data rows of `table` as lists of stripped cell texts.

    The first `<tr>` is skipped only when it contains `<th>` cells, i.e.
    when `get_table_headers` reports it as the header row; a table whose
    first row is plain data keeps that row.
    """
    trs = table.find_all("tr")
    if trs and trs[0].find("th") is not None:
        trs = trs[1:]
    rows = []
    for tr in trs:
        # prefer td cells; rows made only of th cells (seen in wikipedia
        # tables below the table body) fall back to those
        cells = tr.find_all("td") or tr.find_all("th")
        rows.append([cell.text.strip() for cell in cells])
    return rows


def save_as_csv(table_name, headers, rows):
    """Write `rows` under `headers` to the file `<table_name>.csv`.

    `pd.DataFrame` rejects data whose width differs from the number of
    column names, which happens with header-less tables or cells using
    colspan. To avoid that, short rows are padded with empty values and
    missing header names are filled in as "column-N" (N is the 1-based
    column position).
    """
    width = max(len(headers), max((len(row) for row in rows), default=0))
    columns = list(headers) + [
        f"column-{i}" for i in range(len(headers) + 1, width + 1)
    ]
    # None is written as an empty field, the same way pandas pads ragged rows
    padded = [list(row) + [None] * (width - len(row)) for row in rows]
    pd.DataFrame(padded, columns=columns).to_csv(f"{table_name}.csv")


def main(url):
    """Fetch `url` and save every table on the page as `table-<i>.csv`."""
    # get the soup
    soup = get_soup(url)
    # extract all the tables from the web page
    tables = get_all_tables(soup)
    print(f"[+] Found a total of {len(tables)} tables.")
    # iterate over all tables
    for i, table in enumerate(tables, start=1):
        # get the table headers
        headers = get_table_headers(table)
        # get all the rows of the table
        rows = get_table_rows(table)
        # save table as csv file
        table_name = f"table-{i}"
        print(f"[+] Saving {table_name}")
        save_as_csv(table_name, headers, rows)


if __name__ == "__main__":
    try:
        url = sys.argv[1]
    except IndexError:
        print("Please specify a URL.\nUsage: python html_table_extractor.py [URL]")
        # sys.exit is always available; the bare `exit` helper is injected by
        # the `site` module and is missing under `python -S` or when frozen
        sys.exit(1)
    main(url)

Ethical Hacking with Python EBook - Topic - Top


Join 50,000+ Python Programmers & Enthusiasts like you!



Tags

Practical Python PDF Processing EBook - About - Middle


New Tutorials

Popular Tutorials


Ethical Hacking with Python EBook - About - Bottom







[8]ページ先頭

©2009-2025 Movatter.jp