@@ -2,150 +2,167 @@ package scrape
2
2
3
3
import (
4
4
"encoding/json"
5
+ "errors"
6
+ "regexp"
5
7
"strconv"
6
8
"strings"
7
9
"sync"
8
10
9
11
"github.com/gocolly/colly/v2"
10
- "github.com/nleeper/goment "
12
+ "github.com/mozillazg/go-slugify "
11
13
"github.com/thoas/go-funk"
14
+ "github.com/tidwall/gjson"
12
15
"github.com/xbapps/xbvr/pkg/models"
13
16
)
14
17
15
18
func VirtualPorn (wg * sync.WaitGroup ,updateSite bool ,knownScenes []string ,out chan <- models.ScrapedScene ,singleSceneURL string ,singeScrapeAdditionalInfo string ,limitScraping bool )error {
19
+ // this scraper is non-standard in that it gathers info via an api rather than scraping html pages
16
20
defer wg .Done ()
17
21
scraperID := "bvr"
18
22
siteID := "VirtualPorn"
19
23
logScrapeStart (scraperID ,siteID )
24
+ nextApiUrl := ""
20
25
21
- sceneCollector := createCollector ("virtualporn.com" )
22
26
siteCollector := createCollector ("virtualporn.com" )
23
- pageCnt := 1
24
-
25
- sceneCollector .OnHTML (`html` ,func (e * colly.HTMLElement ) {
26
- sc := models.ScrapedScene {}
27
- sc .ScraperID = scraperID
28
- sc .SceneType = "VR"
29
- sc .Studio = "BangBros"
30
- sc .Site = siteID
31
- sc .HomepageURL = strings .Split (e .Request .URL .String (),"?" )[0 ]
32
- sc .MembersUrl = "https://members.bangbros.com/product/655/movie/" + strings .Replace (strings .Split (e .Request .URL .String (),"/" )[3 ],"video" ,"" ,1 )
33
-
34
- // Title / Cover / ID / Filenames
35
- e .ForEach (`dl8-video` ,func (id int ,e * colly.HTMLElement ) {
36
- sc .Title = strings .TrimSpace (e .Attr ("title" ))
37
-
38
- tmpCover := e .Request .AbsoluteURL (e .Request .AbsoluteURL (e .Attr ("poster" )))
39
- sc .Covers = append (sc .Covers ,tmpCover )
40
-
41
- tmp := strings .Split (tmpCover ,"/" )
42
- sc .SceneID = strings .Replace (tmp [5 ],"bvr" ,"bvr-" ,1 )
43
-
44
- e .ForEach (`source` ,func (id int ,e * colly.HTMLElement ) {
45
- tmpFile := strings .Split (e .Attr ("src" ),"/" )
46
- sc .Filenames = append (sc .Filenames ,strings .Replace (tmpFile [len (tmpFile )- 1 ],"trailer-" ,"" ,- 1 ))
27
+ apiCollector := createCollector ("site-api.project1service.com" )
28
+ offset := 0
29
+
30
+ apiCollector .OnResponse (func (r * colly.Response ) {
31
+ sceneListJson := gjson .ParseBytes (r .Body )
32
+
33
+ processScene := func (scene gjson.Result ) {
34
+ sc := models.ScrapedScene {}
35
+ sc .ScraperID = scraperID
36
+ sc .SceneType = "VR"
37
+ sc .Studio = "BangBros"
38
+ sc .Site = siteID
39
+ id := strconv .Itoa (int (scene .Get ("id" ).Int ()))
40
+ sc .SceneID = "bvr-" + id
41
+
42
+ sc .Title = scene .Get ("title" ).String ()
43
+ sc .HomepageURL = "https://virtualporn.com/video/" + id + "/" + slugify .Slugify (strings .ReplaceAll (sc .Title ,"'" ,"" ))
44
+ sc .MembersUrl = "https://site-ma.virtualporn.com/scene/" + id + "/" + slugify .Slugify (strings .ReplaceAll (sc .Title ,"'" ,"" ))
45
+ sc .Synopsis = scene .Get ("description" ).String ()
46
+ dateParts := strings .Split (scene .Get ("dateReleased" ).String (),"T" )
47
+ sc .Released = dateParts [0 ]
48
+
49
+ scene .Get ("images.poster" ).ForEach (func (key ,imgGroup gjson.Result )bool {
50
+ if key .String ()== "0" {
51
+ imgurl := imgGroup .Get ("xl.urls.webp" ).String ()
52
+ if imgurl != "" {
53
+ sc .Covers = append (sc .Covers ,imgurl )
54
+ }
55
+
56
+ }else {
57
+ imgurl := imgGroup .Get ("xl.urls.webp" ).String ()
58
+ if imgurl != "" {
59
+ if len (sc .Covers )== 0 {
60
+ sc .Covers = append (sc .Covers ,imgurl )
61
+ }else {
62
+ sc .Gallery = append (sc .Gallery ,imgurl )
63
+ }
64
+ }
65
+ }
66
+ return true
47
67
})
48
- })
49
68
50
- file5kExists := false
51
- for _ ,filename := range sc .Filenames {
52
- if strings .Contains (filename ,"5k" ) {
53
- file5kExists = true
54
- }
55
- }
56
- if ! file5kExists {
57
- sc .Filenames = append (sc .Filenames ,strings .Replace (sc .SceneID ,"bvr-" ,"bvr" ,- 1 )+ "-5k.mp4" )
58
- }
59
-
60
- // Gallery
61
- e .ForEach (`div.player__thumbs img` ,func (id int ,e * colly.HTMLElement ) {
62
- sc .Gallery = append (sc .Gallery ,e .Attr ("src" ))
63
- })
69
+ // Cast
70
+ sc .ActorDetails = make (map [string ]models.ActorDetails )
71
+ scene .Get ("actors" ).ForEach (func (key ,actor gjson.Result )bool {
72
+ name := actor .Get ("name" ).String ()
73
+ if actor .Get ("gender" ).String ()== "female" {
74
+ sc .Cast = append (sc .Cast ,name )
75
+ }
76
+ sc .ActorDetails [actor .Get ("name" ).String ()]= models.ActorDetails {Source :scraperID + " scrape" ,ProfileUrl :"https://virtualporn.com/model/" + strconv .Itoa (int (actor .Get ("id" ).Int ()))+ "/" + slugify .Slugify (name )}
77
+ return true
78
+ })
64
79
65
- // trailer details
66
- sc .TrailerType = "scrape_html"
67
- params := models.TrailerScrape {SceneUrl :sc .HomepageURL ,HtmlElement :"dl8-video source" ,ContentPath :"src" ,QualityPath :"quality" }
68
- strParams ,_ := json .Marshal (params )
69
- sc .TrailerSrc = string (strParams )
70
-
71
- // Cast
72
- sc .ActorDetails = make (map [string ]models.ActorDetails )
73
- e .ForEach (`div.player__stats p.player__stats__cast a` ,func (id int ,e * colly.HTMLElement ) {
74
- if strings .TrimSpace (e .Text )!= "" {
75
- sc .Cast = append (sc .Cast ,strings .TrimSpace (strings .ReplaceAll (e .Text ,"!" ,"" )))
76
- sc .ActorDetails [strings .TrimSpace (strings .ReplaceAll (e .Text ,"!" ,"" ))]= models.ActorDetails {Source :scraperID + " scrape" ,ProfileUrl :e .Request .AbsoluteURL (e .Attr ("href" ))}
77
- }
78
- })
80
+ // Tags
81
+ scene .Get ("tags" ).ForEach (func (key ,tag gjson.Result )bool {
82
+ if tag .Get ("isVisible" ).Bool () {
83
+ sc .Tags = append (sc .Tags ,tag .Get ("name" ).String ())
84
+ }
85
+ return true
86
+ })
79
87
80
- // Tags
81
- e .ForEach (`div.video__tags__list a.tags` ,func (id int ,e * colly.HTMLElement ) {
82
- tag := strings .TrimSpace (e .Text )
83
- if tag != "" {
84
- sc .Tags = append (sc .Tags ,strings .ToLower (tag ))
85
- }
86
- })
88
+ // trailer & filename details
89
+ sc .TrailerType = "urls"
90
+ var trailers []models.VideoSource
91
+ scene .Get ("children" ).ForEach (func (key ,child gjson.Result )bool {
92
+ child .Get ("videos.full.files" ).ForEach (func (key ,file gjson.Result )bool {
93
+ quality := file .Get ("format" ).String ()
94
+ url := file .Get ("urls.view" ).String ()
95
+ filename := file .Get ("urls.download" ).String ()
96
+ if url != "" {
97
+ trailers = append (trailers , models.VideoSource {URL :url ,Quality :quality })
98
+ }
99
+ pos := strings .Index (filename ,"?filename=" )
100
+ if pos != - 1 {
101
+ sc .Filenames = append (sc .Filenames ,filename [pos + 10 :])
102
+ }
103
+ return true
104
+ })
105
+ return true
106
+ })
107
+ trailerJson ,_ := json .Marshal (models.VideoSourceResponse {VideoSources :trailers })
108
+ sc .TrailerSrc = string (trailerJson )
87
109
88
- // Synposis
89
- e .ForEach (`p.player__description` ,func (id int ,e * colly.HTMLElement ) {
90
- sc .Synopsis = strings .TrimSpace (e .Text )
91
- })
110
+ out <- sc
92
111
93
- // Release date / Duration
94
- tmpDate ,_ := goment .New (strings .TrimSpace (e .Request .Ctx .GetAny ("date" ).(string )),"MMM DD, YYYY" )
95
- sc .Released = tmpDate .Format ("YYYY-MM-DD" )
96
- tmpDuration ,err := strconv .Atoi (strings .TrimSpace (strings .Replace (e .Request .Ctx .GetAny ("dur" ).(string ),"mins" ,"" ,- 1 )))
97
- if err == nil {
98
- sc .Duration = tmpDuration
112
+ }
113
+ total := int (sceneListJson .Get ("meta.total" ).Int ())
114
+ scenes := sceneListJson .Get ("result" )
115
+ if strings .Contains (r .Request .URL .RawQuery ,"offset=" ) {
116
+ scenes .ForEach (func (key ,scene gjson.Result )bool {
117
+ // check if we have the scene already
118
+ matches := funk .Filter (knownScenes ,func (s string )bool {
119
+ return strings .Contains (s ,scene .Get ("id" ).String ())
120
+ })
121
+ if funk .IsEmpty (matches ) {
122
+ processScene (scene )
123
+ }
124
+ return true
125
+ })
126
+ }else {
127
+ processScene (scenes )
99
128
}
100
129
101
- out <- sc
102
- })
103
-
104
- siteCollector .OnHTML (`body` ,func (e * colly.HTMLElement ) {
105
- sceneCnt := 0
106
- e .ForEach (`div.recommended__item` ,func (id int ,e * colly.HTMLElement ) {
107
- sceneCnt += 1
108
- })
109
-
110
- if sceneCnt > 0 {
111
- pageCnt += 1
130
+ offset += 24
131
+ if offset < total {
112
132
if ! limitScraping {
113
- siteCollector .Visit ("https://virtualporn. com/videos/ " + strconv .Itoa (pageCnt ))
133
+ apiCollector .Visit ("https://site-api.project1service. com/v2/releases?type=scene&limit=24&offset= " + strconv .Itoa (offset ))
114
134
}
115
135
}
116
136
})
117
137
118
- siteCollector .OnHTML (`div.recommended__item` ,func (e * colly.HTMLElement ) {
119
- sceneURL := e .Request .AbsoluteURL (e .ChildAttr (`a` ,"href" ))
120
-
121
- // If scene exist in database, there's no need to scrape
122
- if ! funk .ContainsString (knownScenes ,sceneURL ) {
123
-
124
- //Date & Duration from main index
125
- ctx := colly .NewContext ()
126
- e .ForEach (`span.recommended__item__info__date` ,func (id int ,e * colly.HTMLElement ) {
127
- if id == 0 {
128
- ctx .Put ("date" ,strings .TrimSpace (e .Text ))
129
- }
130
- })
131
- e .ForEach (`span.recommended__item__time` ,func (id int ,e * colly.HTMLElement ) {
132
- if id == 0 {
133
- ctx .Put ("dur" ,strings .TrimSpace (e .Text ))
134
- }
138
+ siteCollector .OnHTML (`script` ,func (e * colly.HTMLElement ) {
139
+ // only interested in a script containing window.__JUAN.rawInstance
140
+ re := regexp .MustCompile (`window\.__JUAN\.rawInstance = (\{.*?\});` )
141
+ matches := re .FindStringSubmatch (e .Text )
142
+ if len (matches )> 1 {
143
+ instanceJson := gjson .ParseBytes ([]byte (matches [1 ]))
144
+ token := instanceJson .Get ("jwt" ).String ()
145
+ // set up api requests to use the token in the Instance Header
146
+ apiCollector .OnRequest (func (r * colly.Request ) {
147
+ r .Headers .Set ("Instance" ,token )
135
148
})
136
-
137
- sceneCollector .Request ("GET" ,sceneURL ,nil ,ctx ,nil )
149
+ apiCollector .Visit (nextApiUrl )
138
150
}
139
151
})
140
-
141
152
if singleSceneURL != "" {
142
153
ctx := colly .NewContext ()
143
154
ctx .Put ("dur" ,"" )
144
155
ctx .Put ("date" ,"" )
156
+ urlParts := strings .Split (singleSceneURL ,"/" )
157
+ id := urlParts [len (urlParts )- 2 ]
158
+ offset = 9999 // don't read more pages, we only need 1
159
+ nextApiUrl = "https://site-api.project1service.com/v2/releases/" + id
160
+ siteCollector .Visit ("https://virtualporn.com/videos" )
145
161
146
- sceneCollector .Request ("GET" ,singleSceneURL ,nil ,ctx ,nil )
147
162
}else {
148
- siteCollector .Visit ("https://virtualporn.com/videos/" + strconv .Itoa (pageCnt ))
163
+ // call virtualporn.com, this is just to get the instance token to use the api for this session
164
+ nextApiUrl = "https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv .Itoa (offset )
165
+ siteCollector .Visit ("https://virtualporn.com/videos" )
149
166
}
150
167
151
168
if updateSite {
@@ -158,3 +175,80 @@ func VirtualPorn(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
158
175
func init () {
159
176
registerScraper ("bvr" ,"VirtualPorn" ,"https://images.cn77nd.com/members/bangbros/favicon/apple-icon-60x60.png" ,"virtualporn.com" ,VirtualPorn )
160
177
}
178
+
179
+ // one off conversion routine called by migrations.go
180
+ func UpdateVirtualPornIds ()error {
181
+ collector := createCollector ("virtualporn.com" )
182
+ apiCollector := createCollector ("site-api.project1service.com" )
183
+ offset := 0
184
+ sceneCnt := 0
185
+
186
+ collector .OnHTML (`script` ,func (e * colly.HTMLElement ) {
187
+ // only interested in a script containg window\.__JUAN\.rawInstance
188
+ re := regexp .MustCompile (`window\.__JUAN\.rawInstance = (\{.*?\});` )
189
+ matches := re .FindStringSubmatch (e .Text )
190
+ if len (matches )> 1 {
191
+ instanceJson := gjson .ParseBytes ([]byte (matches [1 ]))
192
+ token := instanceJson .Get ("jwt" ).String ()
193
+ // set up api requests to use the token in the Instance Header
194
+ apiCollector .OnRequest (func (r * colly.Request ) {
195
+ r .Headers .Set ("Instance" ,token )
196
+ })
197
+ apiCollector .Visit ("https://site-api.project1service.com/v2/releases?type=scene&limit=100&offset=" + strconv .Itoa (offset ))
198
+ }
199
+ })
200
+
201
+ apiCollector .OnResponse (func (r * colly.Response ) {
202
+ db ,_ := models .GetDB ()
203
+ defer db .Close ()
204
+
205
+ sceneListJson := gjson .ParseBytes (r .Body )
206
+ sceneCnt = int (sceneListJson .Get ("meta.total" ).Int ())
207
+ scenes := sceneListJson .Get ("result" )
208
+ scenes .ForEach (func (key ,apiScene gjson.Result )bool {
209
+ id := strconv .Itoa (int (apiScene .Get ("id" ).Int ()))
210
+ title := apiScene .Get ("title" ).String ()
211
+ dateParts := strings .Split (apiScene .Get ("dateReleased" ).String (),"T" )
212
+ releasedDate := dateParts [0 ]
213
+ var scene models.Scene
214
+ scene .GetIfExist ("bvr-" + id )
215
+ if scene .ID > 0 {
216
+ // get the next record, this one already matches the new id
217
+ return true
218
+ }
219
+ db .Where ("scraper_id = ? and release_date_text = ?" ,"bvr" ,releasedDate ).Find (& scene )
220
+ if scene .ID > 0 {
221
+ oldSceneId := scene .SceneID
222
+ log .Infof ("Updating SceneId %s to %s " ,oldSceneId ,"bvr-" + id )
223
+ scene .LegacySceneID = scene .SceneID
224
+ scene .SceneID = "bvr-" + id
225
+ scene .SceneURL = "https://virtualporn.com/video/" + id + "/" + slugify .Slugify (strings .ReplaceAll (title ,"'" ,"" ))
226
+ scene .MemberURL = "https://site-ma.virtualporn.com/scene/" + id + "/" + slugify .Slugify (strings .ReplaceAll (title ,"'" ,"" ))
227
+
228
+ scene .Save ()
229
+ result := db .Model (& models.Action {}).Where ("scene_id = ?" ,oldSceneId ).Update ("scene_id" ,scene .SceneID )
230
+ if result .Error != nil {
231
+ log .Infof ("Converting Actions for VirtualPorn Scene %s to %s failed, %s" ,oldSceneId ,scene .SceneID ,result .Error )
232
+ }
233
+ result = db .Model (& models.ExternalReferenceLink {}).Where ("internal_table = 'scenes' and internal_name_id = ?" ,oldSceneId ).Update ("internal_name_id" ,scene .SceneID )
234
+ if result .Error != nil {
235
+ log .Infof ("Converting External Reference Links for VirtualPorn Scene %s to %s failed, %s" ,oldSceneId ,scene .SceneID ,result .Error )
236
+ }
237
+ }
238
+ return true
239
+ })
240
+ offset += 100
241
+ if offset < sceneCnt {
242
+ apiCollector .Visit ("https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset=" + strconv .Itoa (offset ))
243
+ }
244
+ })
245
+
246
+ collector .Visit ("https://virtualporn.com/videos" )
247
+
248
+ if sceneCnt > 0 {
249
+ return nil
250
+ }else {
251
+ return errors .New ("No scenes updated" )
252
+ }
253
+
254
+ }