Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit219f19a

Browse files
authored
scraper: Rewrite VirtualPorn using API as a Source (#1654)
1 parent42a24eb commit219f19a

File tree

3 files changed

+208
-107
lines changed

3 files changed

+208
-107
lines changed

‎pkg/migrations/migrations.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1925,6 +1925,13 @@ func Migrate() {
19251925
returnerr
19261926
},
19271927
},
1928+
{
1929+
ID:"0077-Update-VirtualPorn-ids",
1930+
Migrate:func(tx*gorm.DB)error {
1931+
err:=scrape.UpdateVirtualPornIds()
1932+
returnerr
1933+
},
1934+
},
19281935
})
19291936

19301937
iferr:=m.Migrate();err!=nil {

‎pkg/models/model_external_reference.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -518,7 +518,7 @@ func (scrapeRules ActorScraperConfig) buildGenericActorScraperRules() {
518518

519519
siteDetails=GenericScraperRuleSet{}
520520
siteDetails.Domain="virtualporn.com"
521-
siteDetails.SiteRules=append(siteDetails.SiteRules,GenericActorScraperRule{XbvrField:"image_url",Selector:`div.model__img-wrapper > img`,ResultType:"attr",Attribute:"src"})
521+
siteDetails.SiteRules=append(siteDetails.SiteRules,GenericActorScraperRule{XbvrField:"image_url",Selector:`section[data-cy="actorProfilePicture"] img`,ResultType:"attr",Attribute:"src"})
522522
scrapeRules.GenericActorScrapingConfig["bvr scrape"]=siteDetails
523523

524524
siteDetails=GenericScraperRuleSet{}

‎pkg/scrape/virtualporn.go

Lines changed: 200 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -2,150 +2,167 @@ package scrape
22

33
import (
44
"encoding/json"
5+
"errors"
6+
"regexp"
57
"strconv"
68
"strings"
79
"sync"
810

911
"github.com/gocolly/colly/v2"
10-
"github.com/nleeper/goment"
12+
"github.com/mozillazg/go-slugify"
1113
"github.com/thoas/go-funk"
14+
"github.com/tidwall/gjson"
1215
"github.com/xbapps/xbvr/pkg/models"
1316
)
1417

1518
funcVirtualPorn(wg*sync.WaitGroup,updateSitebool,knownScenes []string,outchan<- models.ScrapedScene,singleSceneURLstring,singeScrapeAdditionalInfostring,limitScrapingbool)error {
19+
// this scraper is non-standard in that it gathers info via an api rather than scraping html pages
1620
deferwg.Done()
1721
scraperID:="bvr"
1822
siteID:="VirtualPorn"
1923
logScrapeStart(scraperID,siteID)
24+
nextApiUrl:=""
2025

21-
sceneCollector:=createCollector("virtualporn.com")
2226
siteCollector:=createCollector("virtualporn.com")
23-
pageCnt:=1
24-
25-
sceneCollector.OnHTML(`html`,func(e*colly.HTMLElement) {
26-
sc:= models.ScrapedScene{}
27-
sc.ScraperID=scraperID
28-
sc.SceneType="VR"
29-
sc.Studio="BangBros"
30-
sc.Site=siteID
31-
sc.HomepageURL=strings.Split(e.Request.URL.String(),"?")[0]
32-
sc.MembersUrl="https://members.bangbros.com/product/655/movie/"+strings.Replace(strings.Split(e.Request.URL.String(),"/")[3],"video","",1)
33-
34-
// Title / Cover / ID / Filenames
35-
e.ForEach(`dl8-video`,func(idint,e*colly.HTMLElement) {
36-
sc.Title=strings.TrimSpace(e.Attr("title"))
37-
38-
tmpCover:=e.Request.AbsoluteURL(e.Request.AbsoluteURL(e.Attr("poster")))
39-
sc.Covers=append(sc.Covers,tmpCover)
40-
41-
tmp:=strings.Split(tmpCover,"/")
42-
sc.SceneID=strings.Replace(tmp[5],"bvr","bvr-",1)
43-
44-
e.ForEach(`source`,func(idint,e*colly.HTMLElement) {
45-
tmpFile:=strings.Split(e.Attr("src"),"/")
46-
sc.Filenames=append(sc.Filenames,strings.Replace(tmpFile[len(tmpFile)-1],"trailer-","",-1))
27+
apiCollector:=createCollector("site-api.project1service.com")
28+
offset:=0
29+
30+
apiCollector.OnResponse(func(r*colly.Response) {
31+
sceneListJson:=gjson.ParseBytes(r.Body)
32+
33+
processScene:=func(scene gjson.Result) {
34+
sc:= models.ScrapedScene{}
35+
sc.ScraperID=scraperID
36+
sc.SceneType="VR"
37+
sc.Studio="BangBros"
38+
sc.Site=siteID
39+
id:=strconv.Itoa(int(scene.Get("id").Int()))
40+
sc.SceneID="bvr-"+id
41+
42+
sc.Title=scene.Get("title").String()
43+
sc.HomepageURL="https://virtualporn.com/video/"+id+"/"+slugify.Slugify(strings.ReplaceAll(sc.Title,"'",""))
44+
sc.MembersUrl="https://site-ma.virtualporn.com/scene/"+id+"/"+slugify.Slugify(strings.ReplaceAll(sc.Title,"'",""))
45+
sc.Synopsis=scene.Get("description").String()
46+
dateParts:=strings.Split(scene.Get("dateReleased").String(),"T")
47+
sc.Released=dateParts[0]
48+
49+
scene.Get("images.poster").ForEach(func(key,imgGroup gjson.Result)bool {
50+
ifkey.String()=="0" {
51+
imgurl:=imgGroup.Get("xl.urls.webp").String()
52+
ifimgurl!="" {
53+
sc.Covers=append(sc.Covers,imgurl)
54+
}
55+
56+
}else {
57+
imgurl:=imgGroup.Get("xl.urls.webp").String()
58+
ifimgurl!="" {
59+
iflen(sc.Covers)==0 {
60+
sc.Covers=append(sc.Covers,imgurl)
61+
}else {
62+
sc.Gallery=append(sc.Gallery,imgurl)
63+
}
64+
}
65+
}
66+
returntrue
4767
})
48-
})
4968

50-
file5kExists:=false
51-
for_,filename:=rangesc.Filenames {
52-
ifstrings.Contains(filename,"5k") {
53-
file5kExists=true
54-
}
55-
}
56-
if!file5kExists {
57-
sc.Filenames=append(sc.Filenames,strings.Replace(sc.SceneID,"bvr-","bvr",-1)+"-5k.mp4")
58-
}
59-
60-
// Gallery
61-
e.ForEach(`div.player__thumbs img`,func(idint,e*colly.HTMLElement) {
62-
sc.Gallery=append(sc.Gallery,e.Attr("src"))
63-
})
69+
// Cast
70+
sc.ActorDetails=make(map[string]models.ActorDetails)
71+
scene.Get("actors").ForEach(func(key,actor gjson.Result)bool {
72+
name:=actor.Get("name").String()
73+
ifactor.Get("gender").String()=="female" {
74+
sc.Cast=append(sc.Cast,name)
75+
}
76+
sc.ActorDetails[actor.Get("name").String()]= models.ActorDetails{Source:scraperID+" scrape",ProfileUrl:"https://virtualporn.com/model/"+strconv.Itoa(int(actor.Get("id").Int()))+"/"+slugify.Slugify(name)}
77+
returntrue
78+
})
6479

65-
// trailer details
66-
sc.TrailerType="scrape_html"
67-
params:= models.TrailerScrape{SceneUrl:sc.HomepageURL,HtmlElement:"dl8-video source",ContentPath:"src",QualityPath:"quality"}
68-
strParams,_:=json.Marshal(params)
69-
sc.TrailerSrc=string(strParams)
70-
71-
// Cast
72-
sc.ActorDetails=make(map[string]models.ActorDetails)
73-
e.ForEach(`div.player__stats p.player__stats__cast a`,func(idint,e*colly.HTMLElement) {
74-
ifstrings.TrimSpace(e.Text)!="" {
75-
sc.Cast=append(sc.Cast,strings.TrimSpace(strings.ReplaceAll(e.Text,"!","")))
76-
sc.ActorDetails[strings.TrimSpace(strings.ReplaceAll(e.Text,"!",""))]= models.ActorDetails{Source:scraperID+" scrape",ProfileUrl:e.Request.AbsoluteURL(e.Attr("href"))}
77-
}
78-
})
80+
// Tags
81+
scene.Get("tags").ForEach(func(key,tag gjson.Result)bool {
82+
iftag.Get("isVisible").Bool() {
83+
sc.Tags=append(sc.Tags,tag.Get("name").String())
84+
}
85+
returntrue
86+
})
7987

80-
// Tags
81-
e.ForEach(`div.video__tags__list a.tags`,func(idint,e*colly.HTMLElement) {
82-
tag:=strings.TrimSpace(e.Text)
83-
iftag!="" {
84-
sc.Tags=append(sc.Tags,strings.ToLower(tag))
85-
}
86-
})
88+
// trailer & filename details
89+
sc.TrailerType="urls"
90+
vartrailers []models.VideoSource
91+
scene.Get("children").ForEach(func(key,child gjson.Result)bool {
92+
child.Get("videos.full.files").ForEach(func(key,file gjson.Result)bool {
93+
quality:=file.Get("format").String()
94+
url:=file.Get("urls.view").String()
95+
filename:=file.Get("urls.download").String()
96+
ifurl!="" {
97+
trailers=append(trailers, models.VideoSource{URL:url,Quality:quality})
98+
}
99+
pos:=strings.Index(filename,"?filename=")
100+
ifpos!=-1 {
101+
sc.Filenames=append(sc.Filenames,filename[pos+10:])
102+
}
103+
returntrue
104+
})
105+
returntrue
106+
})
107+
trailerJson,_:=json.Marshal(models.VideoSourceResponse{VideoSources:trailers})
108+
sc.TrailerSrc=string(trailerJson)
87109

88-
// Synposis
89-
e.ForEach(`p.player__description`,func(idint,e*colly.HTMLElement) {
90-
sc.Synopsis=strings.TrimSpace(e.Text)
91-
})
110+
out<-sc
92111

93-
// Release date / Duration
94-
tmpDate,_:=goment.New(strings.TrimSpace(e.Request.Ctx.GetAny("date").(string)),"MMM DD, YYYY")
95-
sc.Released=tmpDate.Format("YYYY-MM-DD")
96-
tmpDuration,err:=strconv.Atoi(strings.TrimSpace(strings.Replace(e.Request.Ctx.GetAny("dur").(string),"mins","",-1)))
97-
iferr==nil {
98-
sc.Duration=tmpDuration
112+
}
113+
total:=int(sceneListJson.Get("meta.total").Int())
114+
scenes:=sceneListJson.Get("result")
115+
ifstrings.Contains(r.Request.URL.RawQuery,"offset=") {
116+
scenes.ForEach(func(key,scene gjson.Result)bool {
117+
// check if we have the scene already
118+
matches:=funk.Filter(knownScenes,func(sstring)bool {
119+
returnstrings.Contains(s,scene.Get("id").String())
120+
})
121+
iffunk.IsEmpty(matches) {
122+
processScene(scene)
123+
}
124+
returntrue
125+
})
126+
}else {
127+
processScene(scenes)
99128
}
100129

101-
out<-sc
102-
})
103-
104-
siteCollector.OnHTML(`body`,func(e*colly.HTMLElement) {
105-
sceneCnt:=0
106-
e.ForEach(`div.recommended__item`,func(idint,e*colly.HTMLElement) {
107-
sceneCnt+=1
108-
})
109-
110-
ifsceneCnt>0 {
111-
pageCnt+=1
130+
offset+=24
131+
ifoffset<total {
112132
if!limitScraping {
113-
siteCollector.Visit("https://virtualporn.com/videos/"+strconv.Itoa(pageCnt))
133+
apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset="+strconv.Itoa(offset))
114134
}
115135
}
116136
})
117137

118-
siteCollector.OnHTML(`div.recommended__item`,func(e*colly.HTMLElement) {
119-
sceneURL:=e.Request.AbsoluteURL(e.ChildAttr(`a`,"href"))
120-
121-
// If scene exist in database, there's no need to scrape
122-
if!funk.ContainsString(knownScenes,sceneURL) {
123-
124-
//Date & Duration from main index
125-
ctx:=colly.NewContext()
126-
e.ForEach(`span.recommended__item__info__date`,func(idint,e*colly.HTMLElement) {
127-
ifid==0 {
128-
ctx.Put("date",strings.TrimSpace(e.Text))
129-
}
130-
})
131-
e.ForEach(`span.recommended__item__time`,func(idint,e*colly.HTMLElement) {
132-
ifid==0 {
133-
ctx.Put("dur",strings.TrimSpace(e.Text))
134-
}
138+
siteCollector.OnHTML(`script`,func(e*colly.HTMLElement) {
139+
// only interested in a script containing window.__JUAN.rawInstance
140+
re:=regexp.MustCompile(`window\.__JUAN\.rawInstance = (\{.*?\});`)
141+
matches:=re.FindStringSubmatch(e.Text)
142+
iflen(matches)>1 {
143+
instanceJson:=gjson.ParseBytes([]byte(matches[1]))
144+
token:=instanceJson.Get("jwt").String()
145+
// set up api requests to use the token in the Instance Header
146+
apiCollector.OnRequest(func(r*colly.Request) {
147+
r.Headers.Set("Instance",token)
135148
})
136-
137-
sceneCollector.Request("GET",sceneURL,nil,ctx,nil)
149+
apiCollector.Visit(nextApiUrl)
138150
}
139151
})
140-
141152
ifsingleSceneURL!="" {
142153
ctx:=colly.NewContext()
143154
ctx.Put("dur","")
144155
ctx.Put("date","")
156+
urlParts:=strings.Split(singleSceneURL,"/")
157+
id:=urlParts[len(urlParts)-2]
158+
offset=9999// do not read more pages, we only need 1
159+
nextApiUrl="https://site-api.project1service.com/v2/releases/"+id
160+
siteCollector.Visit("https://virtualporn.com/videos")
145161

146-
sceneCollector.Request("GET",singleSceneURL,nil,ctx,nil)
147162
}else {
148-
siteCollector.Visit("https://virtualporn.com/videos/"+strconv.Itoa(pageCnt))
163+
// call virtualporn.com, this is just to get the instance token to use the api for this session
164+
nextApiUrl="https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset="+strconv.Itoa(offset)
165+
siteCollector.Visit("https://virtualporn.com/videos")
149166
}
150167

151168
ifupdateSite {
@@ -158,3 +175,80 @@ func VirtualPorn(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
158175
funcinit() {
159176
registerScraper("bvr","VirtualPorn","https://images.cn77nd.com/members/bangbros/favicon/apple-icon-60x60.png","virtualporn.com",VirtualPorn)
160177
}
178+
179+
// one off conversion routine called by migrations.go
180+
funcUpdateVirtualPornIds()error {
181+
collector:=createCollector("virtualporn.com")
182+
apiCollector:=createCollector("site-api.project1service.com")
183+
offset:=0
184+
sceneCnt:=0
185+
186+
collector.OnHTML(`script`,func(e*colly.HTMLElement) {
187+
// only interested in a script containg window\.__JUAN\.rawInstance
188+
re:=regexp.MustCompile(`window\.__JUAN\.rawInstance = (\{.*?\});`)
189+
matches:=re.FindStringSubmatch(e.Text)
190+
iflen(matches)>1 {
191+
instanceJson:=gjson.ParseBytes([]byte(matches[1]))
192+
token:=instanceJson.Get("jwt").String()
193+
// set up api requests to use the token in the Instance Header
194+
apiCollector.OnRequest(func(r*colly.Request) {
195+
r.Headers.Set("Instance",token)
196+
})
197+
apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=100&offset="+strconv.Itoa(offset))
198+
}
199+
})
200+
201+
apiCollector.OnResponse(func(r*colly.Response) {
202+
db,_:=models.GetDB()
203+
deferdb.Close()
204+
205+
sceneListJson:=gjson.ParseBytes(r.Body)
206+
sceneCnt=int(sceneListJson.Get("meta.total").Int())
207+
scenes:=sceneListJson.Get("result")
208+
scenes.ForEach(func(key,apiScene gjson.Result)bool {
209+
id:=strconv.Itoa(int(apiScene.Get("id").Int()))
210+
title:=apiScene.Get("title").String()
211+
dateParts:=strings.Split(apiScene.Get("dateReleased").String(),"T")
212+
releasedDate:=dateParts[0]
213+
varscene models.Scene
214+
scene.GetIfExist("bvr-"+id)
215+
ifscene.ID>0 {
216+
// get the next record, this one already matches the new id
217+
returntrue
218+
}
219+
db.Where("scraper_id = ? and release_date_text = ?","bvr",releasedDate).Find(&scene)
220+
ifscene.ID>0 {
221+
oldSceneId:=scene.SceneID
222+
log.Infof("Updating SceneId %s to %s ",oldSceneId,"bvr-"+id)
223+
scene.LegacySceneID=scene.SceneID
224+
scene.SceneID="bvr-"+id
225+
scene.SceneURL="https://virtualporn.com/video/"+id+"/"+slugify.Slugify(strings.ReplaceAll(title,"'",""))
226+
scene.MemberURL="https://site-ma.virtualporn.com/scene/"+id+"/"+slugify.Slugify(strings.ReplaceAll(title,"'",""))
227+
228+
scene.Save()
229+
result:=db.Model(&models.Action{}).Where("scene_id = ?",oldSceneId).Update("scene_id",scene.SceneID)
230+
ifresult.Error!=nil {
231+
log.Infof("Converting Actions for VirtualPorn Scene %s to %s failed, %s",oldSceneId,scene.SceneID,result.Error)
232+
}
233+
result=db.Model(&models.ExternalReferenceLink{}).Where("internal_table = 'scenes' and internal_name_id = ?",oldSceneId).Update("internal_name_id",scene.SceneID)
234+
ifresult.Error!=nil {
235+
log.Infof("Converting External Reference Links for VirtualPorn Scene %s to %s failed, %s",oldSceneId,scene.SceneID,result.Error)
236+
}
237+
}
238+
returntrue
239+
})
240+
offset+=100
241+
ifoffset<sceneCnt {
242+
apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset="+strconv.Itoa(offset))
243+
}
244+
})
245+
246+
collector.Visit("https://virtualporn.com/videos")
247+
248+
ifsceneCnt>0 {
249+
returnnil
250+
}else {
251+
returnerrors.New("No scenes updated")
252+
}
253+
254+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp