5 | 5 | from glob import glob |
6 | 6 | import json |
7 | 7 | from time import time |
| 8 | +import datetime |
| 9 | +import difflib |
| 10 | +import requests |
8 | 11 | import numpy as np |
9 | | -from scipy.ndimage import gaussian_filter |
10 | 12 | from tqdm import tqdm |
11 | 13 |
12 | 14 | import util |
13 | 15 |
14 | | -parser = argparse.ArgumentParser(description="Webpage monitor.") |
| 16 | +parser = argparse.ArgumentParser(description='Webpage monitor.') |
15 | 17 | parser.add_argument('--roster_json', |
16 | 18 |                     type=str, |
17 | 19 |                     default='./roster.json', |
18 | | -                    help="path to the roster") |
| 20 | +                    help='path to the roster') |
19 | 21 | parser.add_argument('--check_every', |
20 | 22 |                     type=int, |
21 | 23 |                     default=43200, |
22 | | -                    help="check every N seconds") |
| 24 | +                    help='check every N seconds') |
23 | 25 | parser.add_argument('--exit_after', |
24 | 26 |                     type=int, |
25 | 27 |                     default=None, |
26 | | -                    help="quit after N seconds") |
27 | | -parser.add_argument('--tmp_dir', |
| 28 | +                    help='quit after N seconds') |
| 29 | +parser.add_argument('--snapshot_dir', |
28 | 30 |                     type=str, |
29 | | -                    default='/tmp/webpage-monitor', |
30 | | -                    help="directory to dump screenshots for comparison") |
| 31 | +                    default='./snapshots', |
| 32 | +                    help='directory to dump screenshots for comparison') |
31 | 33 | parser.add_argument('--clear_cached', |
32 | 34 |                     action='store_true', |
33 | | -                    help="whether to clear the screenshots on disk") |
| 35 | +                    help='whether to clear the screenshots on disk') |
34 | 36 |
35 | 37 |
36 | 38 | def main(args): |
37 | | -    if args.exit_after is None: |
38 | | -        exit_after = np.inf |
39 | | -    else: |
40 | | -        exit_after = args.exit_after |
| 39 | +    exit_after = np.inf if args.exit_after is None else args.exit_after |
41 | 40 |
42 | | -    roster = load_roster(args.roster_json) |
| 41 | +    with open(args.roster_json, 'rb') as file_handle: |
| 42 | +        roster = json.load(file_handle) |
43 | 43 |
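For reference, roster.json is expected to be a JSON object mapping each monitored URL to its per-page options (the opt values iterated below are currently unused; see the TODOs further down). A hypothetical sketch of what the loaded roster could look like; the option keys are made up for illustration:

    # Hypothetical result of json.load() for a two-page roster; the option
    # dicts are placeholders, since opt is not consumed anywhere yet.
    roster = {
        'https://example.com/jobs': {},
        'https://example.com/blog': {'note': 'watch for new posts'},
    }
    for url, opt in roster.items():
        print(url, opt)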
44 | 44 |     start_t = time() |
45 | 45 |     last_check_t = 0 |
46 | 46 |
47 | | -    if args.clear_cached and exists(args.tmp_dir): |
48 | | -        rmtree(args.tmp_dir) |
| 47 | +    if args.clear_cached and exists(args.snapshot_dir): |
| 48 | +        rmtree(args.snapshot_dir) |
49 | 49 |
50 | 50 |     while True: |
51 | 51 |         if time() - last_check_t > args.check_every: |
52 | 52 |             changed, deltas = [], [] |
53 | 53 |
54 | | -            for url, opt in tqdm(roster.items(), desc="Checking URLs"): |
55 | | -                out_dir = join(args.tmp_dir, |
56 | | -                               replace_special_char(url)).rstrip('/') |
57 | | -
58 | | -                # Take screenshots |
59 | | -                screenshot(url, out_dir, opt) |
60 | | -
61 | | -                pngs = sorted(glob(join(out_dir, '*.png'))) |
62 | | -
63 | | -                # Compare with previous screenshots |
64 | | -                if len(pngs) > 1: |
65 | | -                    delta_png = out_dir + '_delta.png' |
66 | | -                    delta = diff_screenshots(*pngs[-2:], delta_png) |
67 | | -                    if delta is not None: |
| 54 | +            for url, opt in tqdm(roster.items(), desc='Checking URLs'): |
| 55 | +                # Snapshot the current webpage. |
| 56 | +                out_dir = join(args.snapshot_dir, |
| 57 | +                               util.folder_name_from_url(url)) |
| 58 | +                snapshot(url, out_dir, opt) |
| 59 | +
| 60 | +                # Compare with the previous snapshot. |
| 61 | +                snapshot_paths = sorted( |
| 62 | +                    glob(join(out_dir, '????_??_??_??_??_??.html'))) |
| 63 | +                if len(snapshot_paths) > 1: |
| 64 | +                    delta = diff_snapshots(snapshot_paths[-2], |
| 65 | +                                           snapshot_paths[-1], out_dir, opt) |
| 66 | +                    if delta != '': |
68 | 67 |                         changed.append(url) |
69 | 68 |                         deltas.append(delta) |
70 | 69 |
71 | 70 |                 # Remove earlier screenshots to avoid storage explosion |
72 | | -                if len(pngs) > 2: |
73 | | -                    for f in pngs[:-2]: |
74 | | -                        remove(f) |
| 71 | +                if len(snapshot_paths) > 2: |
| 72 | +                    for x in snapshot_paths[:-2]: |
| 73 | +                        remove(x) |
75 | 74 |
76 | 75 |             last_check_t = time() |
77 | 76 |
78 | 77 |             # Email myself the results |
79 | 78 |             if changed: |
80 | 79 |                 msg = '' |
81 | 80 |                 for url, delta in zip(changed, deltas): |
82 | | -                    msg += "file://{delta}\n{url}\n\n".format(delta=delta, |
83 | | -                                                              url=url) |
84 | | -                util.email_myself(msg, subject="Webpage Monitor") |
85 | | -                util.format_print("Change detected; email sent", 'header') |
86 | | -
87 | | -        if time() - start_t > exit_after: |
88 | | -            break |
89 | | -
90 | | -
91 | | -def diff_screenshots(old_png, |
92 | | -                     new_png, |
93 | | -                     delta_png, |
94 | | -                     pix_diff_thres=0.1, |
95 | | -                     n_diff_thres=16, |
96 | | -                     unchanged_alpha=0.2, |
97 | | -                     diff_blur_sigma=4): |
98 | | -    old = util.imread_arr(old_png) |
99 | | -    new = util.imread_arr(new_png) |
100 | | -
101 | | -    # Sizes are even different |
102 | | -    if old.shape != new.shape: |
103 | | -        util.imwrite_arr(new, delta_png) |
104 | | -        return delta_png |
105 | | -
106 | | -    # Check content |
107 | | -    pixel_is_diff = np.abs(old - new) >= pix_diff_thres  # (H, W, 3) |
108 | | -    pixel_is_diff = np.sum(pixel_is_diff, axis=2) > 0 |
109 | | -
110 | | -    # Not enough different pixels for a change |
111 | | -    if np.sum(pixel_is_diff) <= n_diff_thres: |
112 | | -        return None |
113 | | -
114 | | -    # Highlight the changed part |
115 | | -    alpha = unchanged_alpha * np.ones_like(new) |
116 | | -    alpha[np.dstack([pixel_is_diff] * 3)] = 1 |
117 | | -    alpha = gaussian_filter(alpha, diff_blur_sigma) |
118 | | -    delta = alpha * new + (1 - alpha) * np.zeros_like(new) |
119 | | -    util.imwrite_arr(delta, delta_png) |
120 | | -    return delta_png |
121 | | -
122 | | -
123 | | -def screenshot(url, out_dir, opt, width=512, delay=3): |
| 81 | +                    msg += f'{url}\n{delta}\n\n' |
| 82 | +                util.email_myself(msg, subject='Webpage Monitor') |
| 83 | +                util.format_print('Change detected; email sent', 'header') |
| 84 | +
| 85 | +        if time() - start_t > exit_after: |
| 86 | +            break |
| 87 | +
| 88 | +
| 89 | +def diff_snapshots(html0_path, html1_path, out_dir, opt): |
| 90 | +    # TODO: Handle opt (page-specific special options) |
| 91 | +    html0_content = util.read_file(html0_path) |
| 92 | +    html1_content = util.read_file(html1_path) |
| 93 | +    delta = difflib.ndiff(html0_content.split('\n'), html1_content.split('\n')) |
| 94 | +    # Keep differences only. |
| 95 | +    delta = '\n'.join(x for x in delta |
| 96 | +                      if x.startswith('+ ') or x.startswith('- ')) |
| 97 | +    delta_path = join(out_dir, 'delta.html') |
| 98 | +    util.write_file(delta, delta_path) |
| 99 | +    return delta |
| 100 | +
| 101 | +
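A quick note on the filtering step in diff_snapshots: difflib.ndiff yields every line of both inputs, prefixed with '- ' (only in the old snapshot), '+ ' (only in the new one), '  ' (unchanged), or '? ' (intraline hints), so keeping only the '+ '/'- ' lines reduces the delta to the actual changes. A minimal sketch with made-up HTML lines:

    import difflib

    old_lines = ['<h1>Openings</h1>', '<p>Research Scientist</p>']
    new_lines = ['<h1>Openings</h1>', '<p>Research Engineer</p>']

    delta = difflib.ndiff(old_lines, new_lines)
    # Same filter as above: drop unchanged ('  ') and hint ('? ') lines.
    print('\n'.join(x for x in delta
                    if x.startswith('+ ') or x.startswith('- ')))
    # - <p>Research Scientist</p>
    # + <p>Research Engineer</p>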
| 102 | +def snapshot(url, out_dir, opt): |
| 103 | +    # TODO: Ditto |
| 104 | +    request = requests.get(url) |
| 105 | +    print(url) |
| 106 | +    html_src = request.content.decode() |
124 | 107 |     if not exists(out_dir): |
125 | 108 |         makedirs(out_dir) |
126 | | -
127 | | -    cmd = ('webkit2png --fullsize --no-images --ignore-ssl-check --width={w} ' |
128 | | -           '--delay={delay} --dir={dir_} --filename={t} {url}').format( |
129 | | -               w=width, delay=delay, dir_=out_dir, t=time(), url=url) |
130 | | -    util.call(cmd, silence_stdout=True) |
131 | | -
132 | | -
133 | | -def load_roster(roster_json): |
134 | | -    with open(roster_json, 'r') as h: |
135 | | -        roster = json.load(h) |
136 | | -    return roster |
137 | | -
138 | | -
139 | | -def replace_special_char(url): |
140 | | -    return url.replace('/', '_').replace('?', |
141 | | -                                         '_').replace('&', |
142 | | -                                                      '_').replace(':', '_') |
| 109 | +    timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') |
| 110 | +    html_path = join(out_dir, timestamp + '.html') |
| 111 | +    util.write_file(html_src, html_path) |
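One caveat with the snapshot() above: requests.get() has no default timeout, so a single unresponsive page can stall the whole monitoring loop, and an error page (4xx/5xx) would be stored as if it were a valid snapshot. A defensive variant, sketched under the assumption that skipping a failed fetch until the next cycle is acceptable (it reuses this module's imports and util.write_file):

    def snapshot(url, out_dir, opt, timeout=30):
        # TODO: Ditto
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # treat 4xx/5xx as a failed fetch
        except requests.RequestException as e:
            print(f'Failed to fetch {url}: {e}')
            return
        html_src = response.text  # decoded using the response's charset
        if not exists(out_dir):
            makedirs(out_dir)
        timestamp = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        html_path = join(out_dir, timestamp + '.html')
        util.write_file(html_src, html_path)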
143 | 112 |
144 | 113 |
145 | 114 | if __name__ == '__main__': |
|