from requests_html import HTMLSession
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama
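# note: these third-party packages are assumed to be installed beforehand,
# e.g. with `pip install requests-html beautifulsoup4 colorama`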

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    # initialize an HTTP session
    session = HTMLSession()
    # make HTTP request & retrieve response
    response = session.get(url)
    # execute JavaScript on the page (requests_html renders it in a headless
    # Chromium instance); rendering can fail, e.g. on a timeout or a missing
    # browser download, so fall back to the raw, unrendered HTML in that case
    try:
        response.html.render()
    except Exception:
        pass
    soup = BeautifulSoup(response.html.html, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # skip empty href attributes
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls


def crawl(url, max_urls=50):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 50.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)
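
# Example usage (a sketch: the filename "link_extractor_js.py" is just an assumed
# name for this script, and the target URL is only an illustration):
#
#   $ python link_extractor_js.py https://books.toscrape.com -m 30
#
# This crawls up to roughly 30 pages, prints internal/external links as it goes,
# and writes them to books.toscrape.com_internal_links.txt and
# books.toscrape.com_external_links.txt in the current directory.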