 import requests
 import re
-try:
-    from urllib.parse import urljoin
-except ImportError:
-    from urlparse import urljoin
 
-# regex
-email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')
-link_re = re.compile(r'href="(.*?)"')
+# get url
+# url = input('Enter a URL (include 'http://'):')  -- this is wrong
+url = input('Enter a URL (include `http://`):')
 
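The commented-out line is marked wrong because of quoting, not logic: the unescaped single quotes around http:// close the string literal early, and Python rejects the line with a SyntaxError before anything runs. The committed fix switches to backticks inside the prompt; two more conventional fixes, as a minimal sketch:

    # Broken: the quote before http:// ends the literal early -> SyntaxError
    # url = input('Enter a URL (include 'http://'):')

    # Fix 1: double-quote the outside so the inner single quotes are literal
    url = input("Enter a URL (include 'http://'):")

    # Fix 2: escape the inner quotes instead
    url = input('Enter a URL (include \'http://\'):')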
 
-def crawl(url):
+# connect to the url
+website = requests.get(url)
 
-    result = set()
+# read html
+html = website.text
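The rewrite also drops the old status_code check, so a 404 error page would be scraped like any other response. One way to keep that guard in the new script's straight-line style is requests' raise_for_status(); a minimal sketch, assuming the same url prompt as above:

    import requests

    url = input('Enter a URL (include `http://`):')
    website = requests.get(url)
    website.raise_for_status()  # raises requests.HTTPError for 4xx/5xx responses
    html = website.text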
 
-    req = requests.get(url)
 
-    # Check if successful
-    if (req.status_code != 200):
-        return []
+# use re.findall to grab all the links
+links = re.findall('"((http|ftp)s?://.*?)"', html)
 
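One subtlety in the new link pattern: it contains two capturing groups, the outer URL and the inner (http|ftp) alternation, so re.findall returns a list of tuples rather than plain strings. len(links) still counts matches correctly, but each element looks like ('http://example.com/page', 'http'). If plain URLs are wanted, making the inner group non-capturing with (?:...) is enough; a minimal sketch against made-up HTML:

    import re

    html = '<a href="http://example.com/page">link</a>'

    # Two capturing groups -> findall returns tuples:
    # [('http://example.com/page', 'http')]
    links = re.findall('"((http|ftp)s?://.*?)"', html)

    # A non-capturing inner group returns plain strings:
    # ['http://example.com/page']
    links = re.findall('"((?:http|ftp)s?://.*?)"', html)

Note that since this pattern only matches absolute http/ftp URLs, the deleted urljoin() step has no role here: relative hrefs are simply never captured.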
-    # Find links
-    links = link_re.findall(req.text)
+emails = re.findall('([\w\.,]+@[\w\.,]+\.\w+)', html)
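The email pattern carries the same quirk as the deleted email_re it replaces: the character class [\w\.,] includes a comma, so a string like a,b@c,d.com is captured whole. Dropping the comma, and allowing + and - which do occur in real addresses, tightens it somewhat; a sketch on made-up input, still nowhere near full address validation:

    import re

    html = 'contact: jane.doe@example.com, or a,b@c,d.com'

    loose = re.findall(r'([\w\.,]+@[\w\.,]+\.\w+)', html)
    print(loose)  # ['jane.doe@example.com', 'a,b@c,d.com']

    tight = re.findall(r'([\w.+-]+@[\w.-]+\.\w+)', html)
    print(tight)  # ['jane.doe@example.com']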
 
-    print("\nFound {} links".format(len(links)))
 
-    # Search links for emails
-    for link in links:
+# prints the number of links in the list
+print("\nFound {} links".format(len(links)))
 
-        # Get an absolute URL for a link
-        link = urljoin(url, link)
-
-        # Find all emails on current page
-        result.update(email_re.findall(req.text))
-
-    return result
-
-if __name__ == '__main__':
-    emails = crawl('http://www.realpython.com')
-
-    print("\nScrapped e-mail addresses:")
-    for email in emails:
-        print(email)
-    print("\n")
+for email in emails:
+    print(email)
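One last behavioral change: the deleted crawl() collected results into a set, so each address printed once. The rewrite prints an address once per occurrence in the HTML. If de-duplication is worth keeping, a one-line tweak to the final loop restores it (sorted() just makes the output order stable):

    # Print each address once, in stable order
    for email in sorted(set(emails)):
        print(email)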