|
23 | 23 | from __future__importannotations
|
24 | 24 |
|
25 | 25 | importargparse
|
| 26 | +importconcurrent.futures |
26 | 27 | importdataclasses
|
27 | 28 | importdatetimeasdt
|
28 | 29 | importfilecmp
|
@@ -1249,21 +1250,41 @@ def proofread_canonicals(
|
1249 | 1250 | /3/whatsnew/3.11.html, which may not exist yet.
|
1250 | 1251 | """
|
1251 | 1252 | logging.info("Checking canonical links...")
|
1252 |
| -canonical_re=re.compile( |
1253 |
| -"""<link rel="canonical" href="https://docs.python.org/([^"]*)" />""" |
1254 |
| - ) |
1255 |
| -forfileinwww_root.glob("**/*.html"): |
1256 |
| -html=file.read_text(encoding="UTF-8",errors="surrogateescape") |
1257 |
| -canonical=canonical_re.search(html) |
1258 |
| -ifnotcanonical: |
1259 |
| -continue |
1260 |
| -target=canonical.group(1) |
1261 |
| -ifnot (www_root/target).exists(): |
1262 |
| -logging.info("Removing broken canonical from %s to %s",file,target) |
1263 |
| -html=html.replace(canonical.group(0),"") |
1264 |
| -file.write_text(html,encoding="UTF-8",errors="surrogateescape") |
1265 |
| -ifnotskip_cache_invalidation: |
1266 |
| -purge(http,str(file).replace("/srv/docs.python.org/","")) |
| 1253 | +worker_count= (os.cpu_count()or1)+2 |
| 1254 | +withconcurrent.futures.ThreadPoolExecutor(worker_count)asexecutor: |
| 1255 | +futures= { |
| 1256 | +executor.submit(_check_canonical_rel,file,www_root) |
| 1257 | +forfileinwww_root.glob("**/*.html") |
| 1258 | + } |
| 1259 | +paths_to_purge= { |
| 1260 | +res.relative_to(www_root)# strip the leading /srv/docs.python.org |
| 1261 | +forfutinconcurrent.futures.as_completed(futures) |
| 1262 | +if (res:=fut.result())isnotNone |
| 1263 | + } |
| 1264 | +ifnotskip_cache_invalidation: |
| 1265 | +purge(http,*paths_to_purge) |
| 1266 | + |
| 1267 | + |
| 1268 | +def_check_canonical_rel(file:Path,www_root:Path): |
| 1269 | +# Check for a canonical relation link in the HTML. |
| 1270 | +# If one exists, ensure that the target exists |
| 1271 | +# or otherwise remove the canonical link element. |
| 1272 | +prefix=b'<link rel="canonical" href="https://docs.python.org/' |
| 1273 | +suffix=b'" />' |
| 1274 | +pfx_len=len(prefix) |
| 1275 | +sfx_len=len(suffix) |
| 1276 | +html=file.read_bytes() |
| 1277 | +try: |
| 1278 | +start=html.index(prefix) |
| 1279 | +end=html.index(suffix,start+pfx_len) |
| 1280 | +exceptValueError: |
| 1281 | +returnNone |
| 1282 | +target=html[start+pfx_len :end].decode(errors="surrogateescape") |
| 1283 | +if (www_root/target).exists(): |
| 1284 | +returnNone |
| 1285 | +logging.info("Removing broken canonical from %s to %s",file,target) |
| 1286 | +file.write_bytes(html[:start]+html[end+sfx_len :]) |
| 1287 | +returnfile |
1267 | 1288 |
|
1268 | 1289 |
|
1269 | 1290 | defpurge(http:urllib3.PoolManager,*paths:Path|str)->None:
|
|