Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit f0d92c8

Browse files
authored
scraper: Updated Little Caprice Scraper (#1751)
1 parent b5e3fa5, commit f0d92c8

File tree

1 file changed

+38
-47
lines changed

1 file changed

+38
-47
lines changed

‎pkg/scrape/littlecaprice.go

Lines changed: 38 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,7 @@
11
package scrape
22

33
import (
4-
"regexp"
5-
"strconv"
4+
"net/url"
65
"strings"
76
"sync"
87
"time"
@@ -23,12 +22,7 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
2322
siteCollector:=createCollector("www.littlecaprice-dreams.com")
2423
galleryCollector:=cloneCollector(sceneCollector)
2524

26-
// RegEx Patterns
27-
coverRegEx:=regexp.MustCompile(`\.vid_bg {\nbackground: url\('(.+?)'`)
28-
durationRegEx:=regexp.MustCompile(`(\d+):(\d+)`)
29-
descriptionRegEx:=regexp.MustCompile(`(?i)^e(?:nglish)?:`)
30-
31-
sceneCollector.OnHTML(`article.project`,func(e*colly.HTMLElement) {
25+
sceneCollector.OnHTML(`html`,func(e*colly.HTMLElement) {
3226
sc:= models.ScrapedScene{}
3327
sc.ScraperID=scraperID
3428
sc.SceneType="VR"
@@ -37,52 +31,52 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
3731
sc.HomepageURL=strings.Split(e.Request.URL.String(),"?")[0]
3832

3933
// Scene ID - Generate randomly
40-
sc.SiteID=strings.Split(e.Attr("id"),"-")[1]
34+
e.ForEach(`link[rel="shortlink"]`,func(idint,e*colly.HTMLElement) {
35+
link:=e.Request.AbsoluteURL(e.Attr("href"))
36+
tmpurl,_:=url.Parse(link)
37+
sc.SiteID=tmpurl.Query().Get("p")
38+
})
4139
sc.SceneID=slugify.Slugify(sc.Site+"-"+sc.SiteID)
4240

4341
// Title
44-
sc.Title=strings.TrimSpace(e.ChildText(`.vid_title`))
42+
e.ForEach(`.project-header h1`,func(idint,e*colly.HTMLElement) {
43+
ifid==0 {
44+
sc.Title=strings.TrimSpace(e.Text)
45+
}
46+
})
4547

4648
// Cover
47-
cover:=e.Request.Ctx.GetAny("cover").(string)
48-
iflen(cover)==0 {
49-
cover=coverRegEx.FindStringSubmatch(e.DOM.Find(`style`).Text())[1]
50-
}
51-
cover=strings.Replace(cover,"media.","",-1)
52-
sc.Covers=append(sc.Covers,e.Request.AbsoluteURL(cover))
49+
e.ForEach(`meta[name="og:image"]`,func(idint,e*colly.HTMLElement) {
50+
ifid==0 {
51+
sc.Covers=append(sc.Covers,strings.Split(e.Request.AbsoluteURL(e.Attr("content")),"?")[0])
52+
}
53+
})
5354

5455
// Duration
55-
minutes:=durationRegEx.FindStringSubmatch(e.ChildText(`.vid_length`))[1]
56-
sc.Duration,_=strconv.Atoi(minutes)
5756

5857
// Released
59-
dt,_:=time.Parse("January 2, 2006",e.ChildText(`.vid_date`))
60-
sc.Released=dt.Format("2006-01-02")
58+
e.ForEach(`meta[name="og:published_time"]`,func(idint,e*colly.HTMLElement) {
59+
dt,_:=time.Parse("2006-01-02",e.Attr("content")[:10])
60+
sc.Released=dt.Format("2006-01-02")
61+
})
6162

6263
// Synopsis
63-
sc.Synopsis=strings.TrimSpace(
64-
descriptionRegEx.ReplaceAllString(// Some scene descriptions include a redundant prefix. We remove it.
65-
e.ChildText(`.vid_desc`),""))
64+
e.ForEach(`.desc-text`,func(idint,e*colly.HTMLElement) {
65+
sc.Synopsis=strings.TrimSpace(e.Text)
66+
})
6667

6768
// Cast and tags
68-
e.ForEach(`.vid_infos .vid_info_content a`,func(idint,e*colly.HTMLElement) {
69-
ife.Attr("rel")=="tag" {
70-
sc.Tags=append(sc.Tags,strings.TrimSpace(e.Text))
71-
}else {
72-
sc.Cast=append(sc.Cast,strings.TrimSpace(e.Text))
73-
}
69+
e.ForEach(`.project-models .list a`,func(idint,e*colly.HTMLElement) {
70+
sc.Cast=append(sc.Cast,strings.TrimSpace(e.Text))
7471
})
7572

76-
// Gallery
77-
galleryPage,_:=e.DOM.Find(`.vid_buttons a[href*="project"]`).Attr("href")
78-
ctx:=colly.NewContext()
79-
ctx.Put("scene",sc)
80-
81-
galleryCollector.Request("GET",galleryPage,nil,ctx,nil)
73+
// Tags
74+
e.ForEach(`meta[name="og:video:tag"]`,func(idint,e*colly.HTMLElement) {
75+
sc.Tags=append(sc.Tags,e.Attr("content"))
76+
})
8277

83-
ifgalleryPage=="" {
84-
out<-sc
85-
}
78+
// Gallery
79+
out<-sc
8680
})
8781

8882
galleryCollector.OnHTML(`html`,func(e*colly.HTMLElement) {
@@ -96,16 +90,13 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
9690
out<-sc
9791
})
9892

99-
siteCollector.OnHTML(`.et_pb_portfolio_item`,func(e*colly.HTMLElement) {
100-
sceneURL:=e.Request.AbsoluteURL(e.ChildAttr(`a`,"href"))
93+
siteCollector.OnHTML(`.project-preview`,func(e*colly.HTMLElement) {
94+
sceneURL:=e.Request.AbsoluteURL(e.Attr("href"))
10195

10296
// If scene exists in database, there's no need to scrape
10397
if!funk.ContainsString(knownScenes,sceneURL) {
104-
ctx:=colly.NewContext()
105-
ctx.Put("cover",e.ChildAttr("img","src"))
106-
10798
//sceneCollector.Visit(sceneURL)
108-
sceneCollector.Request("GET",sceneURL,nil,ctx,nil)
99+
sceneCollector.Request("GET",sceneURL,nil,nil,nil)
109100
}
110101
})
111102

@@ -114,7 +105,7 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
114105
ctx.Put("cover","")
115106
sceneCollector.Request("GET",singleSceneURL,nil,ctx,nil)
116107
}else {
117-
siteCollector.Visit("https://www.littlecaprice-dreams.com/virtual-reality-little-caprice-dreams/")
108+
siteCollector.Visit("https://www.littlecaprice-dreams.com/collection/virtual-reality/")
118109
}
119110

120111
// Missing "Me and You" (my-first-time) scene
@@ -124,7 +115,7 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
124115
ctx.Put("cover","https://www.littlecaprice-dreams.com/wp-content/uploads/2021/08/wpp_Little-Caprice-Virtual-Reality_.jpg")
125116

126117
//sceneCollector.Visit(sceneURL)
127-
sceneCollector.Request("GET",sceneURL,nil,ctx,nil)
118+
sceneCollector.Visit(sceneURL)
128119
}
129120

130121
ifupdateSite {
@@ -135,5 +126,5 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
135126
}
136127

137128
funcinit() {
138-
registerScraper("littlecaprice","Little Caprice Dreams","https://littlecaprice-dreams.com/wp-content/uploads/2019/03/cropped-lcd-heart-180x180.png","littlecaprice-dreams.com",LittleCaprice)
129+
registerScraper("littlecaprice","Little Caprice Dreams","https://www.littlecaprice-dreams.com/wp-content/uploads/2019/03/cropped-lcd-heart-192x192.png","littlecaprice-dreams.com",LittleCaprice)
139130
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp