Commit 2760d68

committed

added javascript execution to link extractor tool

1 parent ddf7fe5 commit 2760d68
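The change described above makes the extractor run each page's JavaScript before collecting links, so anchors injected by client-side scripts are also found. As a minimal, hedged sketch of that step in isolation (the commit's actual code follows in the diff below; the URL here is a placeholder, not taken from the commit):

from requests_html import HTMLSession

session = HTMLSession()
response = session.get("https://example.com")  # placeholder URL, not from the commit
try:
    # render() executes the page's JavaScript in headless Chromium
    # (requests-html fetches Chromium on first use), so dynamically
    # inserted <a> tags end up in response.html.html
    response.html.render()
except Exception:
    # rendering can fail (timeouts, missing browser); fall back to the static HTML
    pass
print(len(response.html.absolute_links))  # number of links visible after rendering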

File tree

1 file changed: +117 -0 lines changed
Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
from requests_html import HTMLSession
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

# init the colorama module
colorama.init()

GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET

# initialize the sets of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Returns all URLs found on `url` that belong to the same website.
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    # initialize an HTTP session
    session = HTMLSession()
    # make HTTP request & retrieve response
    response = session.get(url)
    # execute JavaScript
    try:
        response.html.render()
    except Exception:
        pass
    soup = BeautifulSoup(response.html.html, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # empty href tag
            continue
        # join the URL if it's relative (not an absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls


def crawl(url, max_urls=50):
    """
    Crawls a web page and extracts all links.
    You'll find all links in the `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): maximum number of URLs to crawl, default is 50.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)

    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)
