Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 7683043

Browse files
committed
fix issue wrongly identifying internal urls as external urls in the link extractor tutorial
1 parent 06c7cef · commit 7683043

File tree

2 files changed

+3
-9
lines changed

2 files changed

+3
-9
lines changed

‎web-scraping/link-extractor/link_extractor.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ def get_all_website_links(url):
 32  32          """
 33  33          # all URLs of `url`
 34  34          urls = set()
 35    -        # domain name of the URL without the protocol
 36    -        domain_name = urlparse(url).netloc
 37  35          soup = BeautifulSoup(requests.get(url).content, "html.parser")
 38  36          for a_tag in soup.findAll("a"):
 39  37              href = a_tag.attrs.get("href")
@@ -89,16 +87,15 @@ def crawl(url, max_urls=30):
 89  87      args = parser.parse_args()
 90  88      url = args.url
 91  89      max_urls = args.max_urls
 92    -
     90  +
     91  +    # domain name of the URL without the protocol
     92  +    domain_name = urlparse(url).netloc
 93  93      crawl(url, max_urls=max_urls)
 94  94
 95  95      print("[+] Total Internal links:", len(internal_urls))
 96  96      print("[+] Total External links:", len(external_urls))
 97  97      print("[+] Total URLs:", len(external_urls) + len(internal_urls))
 98  98      print("[+] Total crawled URLs:", max_urls)
 99  99
100     -    domain_name = urlparse(url).netloc
101     -
102  99      # save the internal links to a file
103 100      with open(f"{domain_name}_internal_links.txt", "w") as f:
104 101          for internal_link in internal_urls:

‎web-scraping/link-extractor/link_extractor_js.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ def get_all_website_links(url):
 32  32          """
 33  33          # all URLs of `url`
 34  34          urls = set()
 35    -        # domain name of the URL without the protocol
 36    -        domain_name = urlparse(url).netloc
 37  35          # initialize an HTTP session
 38  36          session = HTMLSession()
 39  37          # make HTTP request & retrieve response
@@ -98,15 +96,14 @@ def crawl(url, max_urls=30):
 98  96      args = parser.parse_args()
 99  97      url = args.url
100  98      max_urls = args.max_urls
101     -
     99  +    domain_name = urlparse(url).netloc
102 100      crawl(url, max_urls=max_urls)
103 101
104 102      print("[+] Total Internal links:", len(internal_urls))
105 103      print("[+] Total External links:", len(external_urls))
106 104      print("[+] Total URLs:", len(external_urls) + len(internal_urls))
107 105      print("[+] Total crawled URLs:", max_urls)
108 106
109     -    domain_name = urlparse(url).netloc
110 107
111 108      # save the internal links to a file
112 109      with open(f"{domain_name}_internal_links.txt", "w") as f:

0 commit comments

Comments (0)

[8]ページ先頭

©2009-2025 Movatter.jp