Update 08_basic_email_web_crawler.py #5


Merged

mjhea0 merged 1 commit into realpython:master from RajuKoushik:patch-1 on Feb 18, 2016.
47 changes: 14 additions & 33 deletions in 08_basic_email_web_crawler.py
```diff
@@ -1,45 +1,26 @@
 import requests
 import re
-try:
-    from urllib.parse import urljoin
-except ImportError:
-    from urlparse import urljoin
-
-# regex
-email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
-link_re = re.compile(r'href="(.*?)"')
-
-
-def crawl(url):
-
-    result = set()
-
-    req = requests.get(url)
-
-    # Check if successful
-    if(req.status_code != 200):
-        return []
-
-    # Find links
-    links = link_re.findall(req.text)
-
-    print("\nFound {} links".format(len(links)))
-
-    # Search links for emails
-    for link in links:
-
-        # Get an absolute URL for a link
-        link = urljoin(url, link)
-
-        # Find all emails on current page
-        result.update(email_re.findall(req.text))
-
-    return result
-
-if __name__ == '__main__':
-    emails = crawl('http://www.realpython.com')
-
-    print("\nScrapped e-mail addresses:")
-    for email in emails:
-        print(email)
-    print("\n")
+
+#get url
+#url=input('Enter a URL (include 'http://'):')--this is wrong
+url = input('Enter a URL (include `http://`):')
+
+#connect to the url
+website = requests.get(url)
+
+#read html
+html = website.text
+
+#use re.findall to grab all the links
+links = re.findall('"((http|ftp)s?://.*?)"', html)
+
+emails = re.findall('([\w\.,]+@[\w\.,]+\.\w+)', html)
+
+#prints the number of links in the list
+print("\nFound {} links".format(len(links)))
+
+for email in emails:
+    print(email)
```
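The patched script relies entirely on two regular expressions applied to the raw HTML. As a quick sanity check of how they behave, here is a minimal sketch; the sample HTML string and variable names are illustrative, not from the repo:

```python
import re

# The two patterns from the patched script.
link_pattern = r'"((http|ftp)s?://.*?)"'
email_pattern = r'([\w\.,]+@[\w\.,]+\.\w+)'

# Illustrative sample page (not from the repo).
html = '<a href="http://example.com/about">About</a> Contact: alice@example.com'

# The link pattern has two groups, so findall returns tuples;
# the full URL is the first element of each tuple.
links = [match[0] for match in re.findall(link_pattern, html)]
emails = re.findall(email_pattern, html)

print(links)   # ['http://example.com/about']
print(emails)  # ['alice@example.com']
```

Note that the character classes include a comma, so a string like `a,b@example.com` matches in full; a stricter pattern would drop the comma from the classes.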

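The deleted `crawl()` version resolved each discovered `href` against the page URL with `urljoin` before collecting emails, which is what made relative links usable as absolute URLs. A minimal illustration of that call, using example URLs rather than anything from the script:

```python
try:
    # Python 3
    from urllib.parse import urljoin
except ImportError:
    # Python 2 fallback, as in the deleted code
    from urlparse import urljoin

base = 'http://www.realpython.com/blog/post'

# A root-relative href replaces the whole path.
print(urljoin(base, '/about/'))       # http://www.realpython.com/about/

# A relative href is resolved against the base's directory.
print(urljoin(base, 'archive.html'))  # http://www.realpython.com/blog/archive.html

# An absolute href passes through unchanged.
print(urljoin(base, 'https://github.com/realpython'))  # https://github.com/realpython
```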