1
1
package scrape
2
2
3
3
import (
4
- "regexp"
5
- "strconv"
4
+ "net/url"
6
5
"strings"
7
6
"sync"
8
7
"time"
@@ -23,12 +22,7 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
23
22
siteCollector := createCollector ("www.littlecaprice-dreams.com" )
24
23
galleryCollector := cloneCollector (sceneCollector )
25
24
26
- // RegEx Patterns
27
- coverRegEx := regexp .MustCompile (`\.vid_bg {\nbackground: url\('(.+?)'` )
28
- durationRegEx := regexp .MustCompile (`(\d+):(\d+)` )
29
- descriptionRegEx := regexp .MustCompile (`(?i)^e(?:nglish)?:` )
30
-
31
- sceneCollector .OnHTML (`article.project` ,func (e * colly.HTMLElement ) {
25
+ sceneCollector .OnHTML (`html` ,func (e * colly.HTMLElement ) {
32
26
sc := models.ScrapedScene {}
33
27
sc .ScraperID = scraperID
34
28
sc .SceneType = "VR"
@@ -37,52 +31,52 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
37
31
sc .HomepageURL = strings .Split (e .Request .URL .String (),"?" )[0 ]
38
32
39
33
// Scene ID - extracted from the scraped page (not randomly generated)
40
- sc .SiteID = strings .Split (e .Attr ("id" ),"-" )[1 ]
34
+ e .ForEach (`link[rel="shortlink"]` ,func (id int ,e * colly.HTMLElement ) {
35
+ link := e .Request .AbsoluteURL (e .Attr ("href" ))
36
+ tmpurl ,_ := url .Parse (link )
37
+ sc .SiteID = tmpurl .Query ().Get ("p" )
38
+ })
41
39
sc .SceneID = slugify .Slugify (sc .Site + "-" + sc .SiteID )
42
40
43
41
// Title
44
- sc .Title = strings .TrimSpace (e .ChildText (`.vid_title` ))
42
+ e .ForEach (`.project-header h1` ,func (id int ,e * colly.HTMLElement ) {
43
+ if id == 0 {
44
+ sc .Title = strings .TrimSpace (e .Text )
45
+ }
46
+ })
45
47
46
48
// Cover
47
- cover := e .Request .Ctx .GetAny ("cover" ).(string )
48
- if len (cover )== 0 {
49
- cover = coverRegEx .FindStringSubmatch (e .DOM .Find (`style` ).Text ())[1 ]
50
- }
51
- cover = strings .Replace (cover ,"media." ,"" ,- 1 )
52
- sc .Covers = append (sc .Covers ,e .Request .AbsoluteURL (cover ))
49
+ e .ForEach (`meta[name="og:image"]` ,func (id int ,e * colly.HTMLElement ) {
50
+ if id == 0 {
51
+ sc .Covers = append (sc .Covers ,strings .Split (e .Request .AbsoluteURL (e .Attr ("content" )),"?" )[0 ])
52
+ }
53
+ })
53
54
54
55
// Duration
55
- minutes := durationRegEx .FindStringSubmatch (e .ChildText (`.vid_length` ))[1 ]
56
- sc .Duration ,_ = strconv .Atoi (minutes )
57
56
58
57
// Released
59
- dt ,_ := time .Parse ("January 2, 2006" ,e .ChildText (`.vid_date` ))
60
- sc .Released = dt .Format ("2006-01-02" )
58
+ e .ForEach (`meta[name="og:published_time"]` ,func (id int ,e * colly.HTMLElement ) {
59
+ dt ,_ := time .Parse ("2006-01-02" ,e .Attr ("content" )[:10 ])
60
+ sc .Released = dt .Format ("2006-01-02" )
61
+ })
61
62
62
63
// Synopsis
63
- sc . Synopsis = strings . TrimSpace (
64
- descriptionRegEx . ReplaceAllString ( // Some scene descriptions include a redundant prefix. We remove it.
65
- e . ChildText ( `.vid_desc` ), "" ) )
64
+ e . ForEach ( `.desc-text` , func ( id int , e * colly. HTMLElement ) {
65
+ sc . Synopsis = strings . TrimSpace ( e . Text )
66
+ } )
66
67
67
68
// Cast and tags
68
- e .ForEach (`.vid_infos .vid_info_content a` ,func (id int ,e * colly.HTMLElement ) {
69
- if e .Attr ("rel" )== "tag" {
70
- sc .Tags = append (sc .Tags ,strings .TrimSpace (e .Text ))
71
- }else {
72
- sc .Cast = append (sc .Cast ,strings .TrimSpace (e .Text ))
73
- }
69
+ e .ForEach (`.project-models .list a` ,func (id int ,e * colly.HTMLElement ) {
70
+ sc .Cast = append (sc .Cast ,strings .TrimSpace (e .Text ))
74
71
})
75
72
76
- // Gallery
77
- galleryPage ,_ := e .DOM .Find (`.vid_buttons a[href*="project"]` ).Attr ("href" )
78
- ctx := colly .NewContext ()
79
- ctx .Put ("scene" ,sc )
80
-
81
- galleryCollector .Request ("GET" ,galleryPage ,nil ,ctx ,nil )
73
+ // Tags
74
+ e .ForEach (`meta[name="og:video:tag"]` ,func (id int ,e * colly.HTMLElement ) {
75
+ sc .Tags = append (sc .Tags ,e .Attr ("content" ))
76
+ })
82
77
83
- if galleryPage == "" {
84
- out <- sc
85
- }
78
+ // Gallery request is no longer issued from this handler; emit the scene directly
79
+ out <- sc
86
80
})
87
81
88
82
galleryCollector .OnHTML (`html` ,func (e * colly.HTMLElement ) {
@@ -96,16 +90,13 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
96
90
out <- sc
97
91
})
98
92
99
- siteCollector .OnHTML (`.et_pb_portfolio_item ` ,func (e * colly.HTMLElement ) {
100
- sceneURL := e .Request .AbsoluteURL (e .ChildAttr ( `a` , "href" ))
93
+ siteCollector .OnHTML (`.project-preview ` ,func (e * colly.HTMLElement ) {
94
+ sceneURL := e .Request .AbsoluteURL (e .Attr ( "href" ))
101
95
102
96
// If scene exists in database, there's no need to scrape
103
97
if ! funk .ContainsString (knownScenes ,sceneURL ) {
104
- ctx := colly .NewContext ()
105
- ctx .Put ("cover" ,e .ChildAttr ("img" ,"src" ))
106
-
107
98
//sceneCollector.Visit(sceneURL)
108
- sceneCollector .Request ("GET" ,sceneURL ,nil ,ctx ,nil )
99
+ sceneCollector .Request ("GET" ,sceneURL ,nil ,nil ,nil )
109
100
}
110
101
})
111
102
@@ -114,7 +105,7 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
114
105
ctx .Put ("cover" ,"" )
115
106
sceneCollector .Request ("GET" ,singleSceneURL ,nil ,ctx ,nil )
116
107
}else {
117
- siteCollector .Visit ("https://www.littlecaprice-dreams.com/virtual-reality-little-caprice-dreams /" )
108
+ siteCollector .Visit ("https://www.littlecaprice-dreams.com/collection/ virtual-reality/" )
118
109
}
119
110
120
111
// Missing "Me and You" (my-first-time) scene
@@ -124,7 +115,7 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
124
115
ctx .Put ("cover" ,"https://www.littlecaprice-dreams.com/wp-content/uploads/2021/08/wpp_Little-Caprice-Virtual-Reality_.jpg" )
125
116
126
117
//sceneCollector.Visit(sceneURL)
127
- sceneCollector .Request ( "GET" , sceneURL , nil , ctx , nil )
118
+ sceneCollector .Visit ( sceneURL )
128
119
}
129
120
130
121
if updateSite {
@@ -135,5 +126,5 @@ func LittleCaprice(wg *sync.WaitGroup, updateSite bool, knownScenes []string, ou
135
126
}
136
127
137
128
// init calls registerScraper with the "littlecaprice" identifier string, the
// "Little Caprice Dreams" label, an image URL, the site domain string and the
// LittleCaprice scraper function.
// NOTE(review): the meaning of each argument is inferred from its value only;
// confirm against registerScraper's signature, which is not visible here.
// The only difference between the removed and added call in this hunk is the
// image URL (www. host prefix and 192x192 instead of 180x180).
func init () {
138
- registerScraper ("littlecaprice" ,"Little Caprice Dreams" ,"https://littlecaprice-dreams.com/wp-content/uploads/2019/03/cropped-lcd-heart-180x180 .png" ,"littlecaprice-dreams.com" ,LittleCaprice )
129
+ registerScraper ("littlecaprice" ,"Little Caprice Dreams" ,"https://www. littlecaprice-dreams.com/wp-content/uploads/2019/03/cropped-lcd-heart-192x192 .png" ,"littlecaprice-dreams.com" ,LittleCaprice )
139
130
}