Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 925990e

Browse files
authored
Merge pull request flairNLP#798 from bresslem/add-publisher-der-freitag
New publisher added: Der Freitag
2 parents ba61e0f + a70ce32, commit 925990e

File tree

6 files changed

+232
-0
lines changed

6 files changed

+232
-0
lines changed

‎docs/supported_publishers.md‎

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1361,6 +1361,25 @@
13611361
<td>&#160;</td>
13621362
<td>&#160;</td>
13631363
</tr>
1364+
<tr>
1365+
<td>
1366+
<code>DerFreitag</code>
1367+
</td>
1368+
<td>
1369+
<div>der Freitag</div>
1370+
</td>
1371+
<td>
1372+
<a href="https://www.freitag.de/">
1373+
<span>www.freitag.de</span>
1374+
</a>
1375+
</td>
1376+
<td>
1377+
<code>de</code>
1378+
</td>
1379+
<td>&#160;</td>
1380+
<td>&#160;</td>
1381+
<td>&#160;</td>
1382+
</tr>
13641383
<tr>
13651384
<td>
13661385
<code>NetzpolitikOrg</code>

‎src/fundus/publishers/de/__init__.py‎

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
from .boersenzeitungimportBoersenZeitungParser
1313
from .brimportBRParser
1414
from .business_insider_deimportBusinessInsiderDEParser
15+
from .der_freitagimportDerFreitagParser
1516
from .die_weltimportDieWeltParser
1617
from .die_zeitimportDieZeitParser
1718
from .dwimportDWParser
@@ -605,3 +606,13 @@ class DE(metaclass=PublisherGroup):
605606
Sitemap("https://www.gamestar.de/artikel_archiv_index.xml"),
606607
],
607608
)
609+
610+
# Publisher specification for der Freitag: ties the site's domain to its parser
# and lists the sources handed to the publisher.
DerFreitag = Publisher(
    name="der Freitag",
    domain="https://www.freitag.de/",
    parser=DerFreitagParser,
    sources=[
        RSSFeed("https://www.freitag.de/@@RSS"),
        # NOTE(review): presumably the inverted regex filter keeps only sitemaps whose
        # URL matches "sitemap-articles" -- confirm against the filter semantics.
        Sitemap("https://www.freitag.de/sitemap.xml", sitemap_filter=inverse(regex_filter("sitemap-articles"))),
    ],
)
src/fundus/publishers/de/der_freitag.py

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,57 @@
1+
fromdatetimeimportdate,datetime
2+
fromtypingimportList,Optional
3+
4+
fromlxml.cssselectimportCSSSelector
5+
fromlxml.etreeimportXPath
6+
7+
fromfundus.parserimportArticleBody,BaseParser,Image,ParserProxy,attribute
8+
fromfundus.parser.utilityimport (
9+
extract_article_body_with_selector,
10+
generic_author_parsing,
11+
generic_date_parsing,
12+
image_extraction,
13+
)
14+
15+
16+
class DerFreitagParser(ParserProxy):
    """Parser proxy for articles published by der Freitag (www.freitag.de)."""

    class V1(BaseParser):
        """Extracts article attributes from the precomputed meta tags, `ld` data and document."""

        # Selectors for the three parts that make up the article body.
        _summary_selector = CSSSelector("header > p.bc-article-intro__text")
        _paragraph_selector = CSSSelector("div.bo-article-text > p")
        _subheadline_selector = CSSSelector("div.bo-article-text > h2")

        @attribute
        def title(self) -> Optional[str]:
            """Return the value of the `og:title` meta entry, if present."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Build the article body from the summary, subheadline and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the `author` entry of the `ld` data."""
            raw_authors = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(raw_authors)

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Return the date parsed from the `datePublished` entry of the `ld` data."""
            raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def topics(self) -> List[str]:
            """Return the `keywords` entry of the `ld` data as found, without further parsing."""
            keywords = self.precomputed.ld.bf_search("keywords")
            return keywords

        @attribute
        def images(self) -> List[Image]:
            """Collect the article's images together with their captions and credits."""
            # Boundary selectors passed to image_extraction to delimit the article.
            upper_boundary = CSSSelector("header.bc-article-intro")
            lower_boundary = CSSSelector("span.freitag-article-end")
            image_nodes = CSSSelector("figure img,div[role='figure'] img")
            # Caption and credit are looked up relative to each image's enclosing <figure>.
            caption = XPath("./ancestor::figure//figcaption//span[@class='bo-image__caption__desc']")
            credit = XPath("./ancestor::figure//figcaption//span[@class='bo-image__caption__credit']")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                upper_boundary_selector=upper_boundary,
                lower_boundary_selector=lower_boundary,
                image_selector=image_nodes,
                caption_selector=caption,
                author_selector=credit,
            )

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp