NotificationsYou must be signed in to change notification settings
Fork141
Star403

Commit219f19a

authored

scraper: Rewrite VirtualPorn using API as a Source (#1654)

1 parent42a24eb commit219f19aCopy full SHA for 219f19a

File tree

3 files changed

+208

-107

lines changed

pkg
- migrations
  - migrations.go
- models
  - model_external_reference.go
- scrape
  - virtualporn.go

3 files changed

+208

-107

lines changed

`‎pkg/migrations/migrations.go`

Lines changed: 7 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -1925,6 +1925,13 @@ func Migrate() {`
`1925`	`1925`	`returnerr`
`1926`	`1926`	`},`
`1927`	`1927`	`},`
	`1928`	`+{`
	`1929`	`+ID:"0077-Update-VirtualPorn-ids",`
	`1930`	`+Migrate:func(tx*gorm.DB)error {`
	`1931`	`+err:=scrape.UpdateVirtualPornIds()`
	`1932`	`+returnerr`
	`1933`	`+},`
	`1934`	`+},`
`1928`	`1935`	`})`
`1929`	`1936`
`1930`	`1937`	`iferr:=m.Migrate();err!=nil {`

`‎pkg/models/model_external_reference.go`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -518,7 +518,7 @@ func (scrapeRules ActorScraperConfig) buildGenericActorScraperRules() {`
`518`	`518`
`519`	`519`	`siteDetails=GenericScraperRuleSet{}`
`520`	`520`	`siteDetails.Domain="virtualporn.com"`
`521`		-siteDetails.SiteRules=append(siteDetails.SiteRules,GenericActorScraperRule{XbvrField:"image_url",Selector:`div.model__img-wrapper > img`,ResultType:"attr",Attribute:"src"})
	`521`	+siteDetails.SiteRules=append(siteDetails.SiteRules,GenericActorScraperRule{XbvrField:"image_url",Selector:`section[data-cy="actorProfilePicture"] img`,ResultType:"attr",Attribute:"src"})
`522`	`522`	`scrapeRules.GenericActorScrapingConfig["bvr scrape"]=siteDetails`
`523`	`523`
`524`	`524`	`siteDetails=GenericScraperRuleSet{}`

`‎pkg/scrape/virtualporn.go`

Lines changed: 200 additions & 106 deletions

Original file line number	Diff line number	Diff line change
`@@ -2,150 +2,167 @@ package scrape`
`2`	`2`
`3`	`3`	`import (`
`4`	`4`	`"encoding/json"`
	`5`	`+"errors"`
	`6`	`+"regexp"`
`5`	`7`	`"strconv"`
`6`	`8`	`"strings"`
`7`	`9`	`"sync"`
`8`	`10`
`9`	`11`	`"github.com/gocolly/colly/v2"`
`10`		`-"github.com/nleeper/goment"`
	`12`	`+"github.com/mozillazg/go-slugify"`
`11`	`13`	`"github.com/thoas/go-funk"`
	`14`	`+"github.com/tidwall/gjson"`
`12`	`15`	`"github.com/xbapps/xbvr/pkg/models"`
`13`	`16`	`)`
`14`	`17`
`15`	`18`	`funcVirtualPorn(wg*sync.WaitGroup,updateSitebool,knownScenes []string,outchan<- models.ScrapedScene,singleSceneURLstring,singeScrapeAdditionalInfostring,limitScrapingbool)error {`
	`19`	`+// this scraper is non-standard in that it gathers info via an api rather than scraping html pages`
`16`	`20`	`deferwg.Done()`
`17`	`21`	`scraperID:="bvr"`
`18`	`22`	`siteID:="VirtualPorn"`
`19`	`23`	`logScrapeStart(scraperID,siteID)`
	`24`	`+nextApiUrl:=""`
`20`	`25`
`21`		`-sceneCollector:=createCollector("virtualporn.com")`
`22`	`26`	`siteCollector:=createCollector("virtualporn.com")`
`23`		`-pageCnt:=1`
`24`		`-`
`25`		-sceneCollector.OnHTML(`html`,func(e*colly.HTMLElement) {
`26`		`-sc:= models.ScrapedScene{}`
`27`		`-sc.ScraperID=scraperID`
`28`		`-sc.SceneType="VR"`
`29`		`-sc.Studio="BangBros"`
`30`		`-sc.Site=siteID`
`31`		`-sc.HomepageURL=strings.Split(e.Request.URL.String(),"?")[0]`
`32`		`-sc.MembersUrl="https://members.bangbros.com/product/655/movie/"+strings.Replace(strings.Split(e.Request.URL.String(),"/")[3],"video","",1)`
`33`		`-`
`34`		`-// Title / Cover / ID / Filenames`
`35`		-e.ForEach(`dl8-video`,func(idint,e*colly.HTMLElement) {
`36`		`-sc.Title=strings.TrimSpace(e.Attr("title"))`
`37`		`-`
`38`		`-tmpCover:=e.Request.AbsoluteURL(e.Request.AbsoluteURL(e.Attr("poster")))`
`39`		`-sc.Covers=append(sc.Covers,tmpCover)`
`40`		`-`
`41`		`-tmp:=strings.Split(tmpCover,"/")`
`42`		`-sc.SceneID=strings.Replace(tmp[5],"bvr","bvr-",1)`
`43`		`-`
`44`		-e.ForEach(`source`,func(idint,e*colly.HTMLElement) {
`45`		`-tmpFile:=strings.Split(e.Attr("src"),"/")`
`46`		`-sc.Filenames=append(sc.Filenames,strings.Replace(tmpFile[len(tmpFile)-1],"trailer-","",-1))`
	`27`	`+apiCollector:=createCollector("site-api.project1service.com")`
	`28`	`+offset:=0`
	`29`	`+`
	`30`	`+apiCollector.OnResponse(func(r*colly.Response) {`
	`31`	`+sceneListJson:=gjson.ParseBytes(r.Body)`
	`32`	`+`
	`33`	`+processScene:=func(scene gjson.Result) {`
	`34`	`+sc:= models.ScrapedScene{}`
	`35`	`+sc.ScraperID=scraperID`
	`36`	`+sc.SceneType="VR"`
	`37`	`+sc.Studio="BangBros"`
	`38`	`+sc.Site=siteID`
	`39`	`+id:=strconv.Itoa(int(scene.Get("id").Int()))`
	`40`	`+sc.SceneID="bvr-"+id`
	`41`	`+`
	`42`	`+sc.Title=scene.Get("title").String()`
	`43`	`+sc.HomepageURL="https://virtualporn.com/video/"+id+"/"+slugify.Slugify(strings.ReplaceAll(sc.Title,"'",""))`
	`44`	`+sc.MembersUrl="https://site-ma.virtualporn.com/scene/"+id+"/"+slugify.Slugify(strings.ReplaceAll(sc.Title,"'",""))`
	`45`	`+sc.Synopsis=scene.Get("description").String()`
	`46`	`+dateParts:=strings.Split(scene.Get("dateReleased").String(),"T")`
	`47`	`+sc.Released=dateParts[0]`
	`48`	`+`
	`49`	`+scene.Get("images.poster").ForEach(func(key,imgGroup gjson.Result)bool {`
	`50`	`+ifkey.String()=="0" {`
	`51`	`+imgurl:=imgGroup.Get("xl.urls.webp").String()`
	`52`	`+ifimgurl!="" {`
	`53`	`+sc.Covers=append(sc.Covers,imgurl)`
	`54`	`+}`
	`55`	`+`
	`56`	`+}else {`
	`57`	`+imgurl:=imgGroup.Get("xl.urls.webp").String()`
	`58`	`+ifimgurl!="" {`
	`59`	`+iflen(sc.Covers)==0 {`
	`60`	`+sc.Covers=append(sc.Covers,imgurl)`
	`61`	`+}else {`
	`62`	`+sc.Gallery=append(sc.Gallery,imgurl)`
	`63`	`+}`
	`64`	`+}`
	`65`	`+}`
	`66`	`+returntrue`
`47`	`67`	`})`
`48`		`-})`
`49`	`68`
`50`		`-file5kExists:=false`
`51`		`-for_,filename:=rangesc.Filenames {`
`52`		`-ifstrings.Contains(filename,"5k") {`
`53`		`-file5kExists=true`
`54`		`-}`
`55`		`-}`
`56`		`-if!file5kExists {`
`57`		`-sc.Filenames=append(sc.Filenames,strings.Replace(sc.SceneID,"bvr-","bvr",-1)+"-5k.mp4")`
`58`		`-}`
`59`		`-`
`60`		`-// Gallery`
`61`		-e.ForEach(`div.player__thumbs img`,func(idint,e*colly.HTMLElement) {
`62`		`-sc.Gallery=append(sc.Gallery,e.Attr("src"))`
`63`		`-})`
	`69`	`+// Cast`
	`70`	`+sc.ActorDetails=make(map[string]models.ActorDetails)`
	`71`	`+scene.Get("actors").ForEach(func(key,actor gjson.Result)bool {`
	`72`	`+name:=actor.Get("name").String()`
	`73`	`+ifactor.Get("gender").String()=="female" {`
	`74`	`+sc.Cast=append(sc.Cast,name)`
	`75`	`+}`
	`76`	`+sc.ActorDetails[actor.Get("name").String()]= models.ActorDetails{Source:scraperID+" scrape",ProfileUrl:"https://virtualporn.com/model/"+strconv.Itoa(int(actor.Get("id").Int()))+"/"+slugify.Slugify(name)}`
	`77`	`+returntrue`
	`78`	`+})`
`64`	`79`
`65`		`-// trailer details`
`66`		`-sc.TrailerType="scrape_html"`
`67`		`-params:= models.TrailerScrape{SceneUrl:sc.HomepageURL,HtmlElement:"dl8-video source",ContentPath:"src",QualityPath:"quality"}`
`68`		`-strParams,_:=json.Marshal(params)`
`69`		`-sc.TrailerSrc=string(strParams)`
`70`		`-`
`71`		`-// Cast`
`72`		`-sc.ActorDetails=make(map[string]models.ActorDetails)`
`73`		-e.ForEach(`div.player__stats p.player__stats__cast a`,func(idint,e*colly.HTMLElement) {
`74`		`-ifstrings.TrimSpace(e.Text)!="" {`
`75`		`-sc.Cast=append(sc.Cast,strings.TrimSpace(strings.ReplaceAll(e.Text,"!","")))`
`76`		`-sc.ActorDetails[strings.TrimSpace(strings.ReplaceAll(e.Text,"!",""))]= models.ActorDetails{Source:scraperID+" scrape",ProfileUrl:e.Request.AbsoluteURL(e.Attr("href"))}`
`77`		`-}`
`78`		`-})`
	`80`	`+// Tags`
	`81`	`+scene.Get("tags").ForEach(func(key,tag gjson.Result)bool {`
	`82`	`+iftag.Get("isVisible").Bool() {`
	`83`	`+sc.Tags=append(sc.Tags,tag.Get("name").String())`
	`84`	`+}`
	`85`	`+returntrue`
	`86`	`+})`
`79`	`87`
`80`		`-// Tags`
`81`		-e.ForEach(`div.video__tags__list a.tags`,func(idint,e*colly.HTMLElement) {
`82`		`-tag:=strings.TrimSpace(e.Text)`
`83`		`-iftag!="" {`
`84`		`-sc.Tags=append(sc.Tags,strings.ToLower(tag))`
`85`		`-}`
`86`		`-})`
	`88`	`+// trailer & filename details`
	`89`	`+sc.TrailerType="urls"`
	`90`	`+vartrailers []models.VideoSource`
	`91`	`+scene.Get("children").ForEach(func(key,child gjson.Result)bool {`
	`92`	`+child.Get("videos.full.files").ForEach(func(key,file gjson.Result)bool {`
	`93`	`+quality:=file.Get("format").String()`
	`94`	`+url:=file.Get("urls.view").String()`
	`95`	`+filename:=file.Get("urls.download").String()`
	`96`	`+ifurl!="" {`
	`97`	`+trailers=append(trailers, models.VideoSource{URL:url,Quality:quality})`
	`98`	`+}`
	`99`	`+pos:=strings.Index(filename,"?filename=")`
	`100`	`+ifpos!=-1 {`
	`101`	`+sc.Filenames=append(sc.Filenames,filename[pos+10:])`
	`102`	`+}`
	`103`	`+returntrue`
	`104`	`+})`
	`105`	`+returntrue`
	`106`	`+})`
	`107`	`+trailerJson,_:=json.Marshal(models.VideoSourceResponse{VideoSources:trailers})`
	`108`	`+sc.TrailerSrc=string(trailerJson)`
`87`	`109`
`88`		`-// Synposis`
`89`		-e.ForEach(`p.player__description`,func(idint,e*colly.HTMLElement) {
`90`		`-sc.Synopsis=strings.TrimSpace(e.Text)`
`91`		`-})`
	`110`	`+out<-sc`
`92`	`111`
`93`		`-// Release date / Duration`
`94`		`-tmpDate,_:=goment.New(strings.TrimSpace(e.Request.Ctx.GetAny("date").(string)),"MMM DD, YYYY")`
`95`		`-sc.Released=tmpDate.Format("YYYY-MM-DD")`
`96`		`-tmpDuration,err:=strconv.Atoi(strings.TrimSpace(strings.Replace(e.Request.Ctx.GetAny("dur").(string),"mins","",-1)))`
`97`		`-iferr==nil {`
`98`		`-sc.Duration=tmpDuration`
	`112`	`+}`
	`113`	`+total:=int(sceneListJson.Get("meta.total").Int())`
	`114`	`+scenes:=sceneListJson.Get("result")`
	`115`	`+ifstrings.Contains(r.Request.URL.RawQuery,"offset=") {`
	`116`	`+scenes.ForEach(func(key,scene gjson.Result)bool {`
	`117`	`+// check if we have the scene already`
	`118`	`+matches:=funk.Filter(knownScenes,func(sstring)bool {`
	`119`	`+returnstrings.Contains(s,scene.Get("id").String())`
	`120`	`+})`
	`121`	`+iffunk.IsEmpty(matches) {`
	`122`	`+processScene(scene)`
	`123`	`+}`
	`124`	`+returntrue`
	`125`	`+})`
	`126`	`+}else {`
	`127`	`+processScene(scenes)`
`99`	`128`	`}`
`100`	`129`
`101`		`-out<-sc`
`102`		`-})`
`103`		`-`
`104`		-siteCollector.OnHTML(`body`,func(e*colly.HTMLElement) {
`105`		`-sceneCnt:=0`
`106`		-e.ForEach(`div.recommended__item`,func(idint,e*colly.HTMLElement) {
`107`		`-sceneCnt+=1`
`108`		`-})`
`109`		`-`
`110`		`-ifsceneCnt>0 {`
`111`		`-pageCnt+=1`
	`130`	`+offset+=24`
	`131`	`+ifoffset<total {`
`112`	`132`	`if!limitScraping {`
`113`		`-siteCollector.Visit("https://virtualporn.com/videos/"+strconv.Itoa(pageCnt))`
	`133`	`+apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset="+strconv.Itoa(offset))`
`114`	`134`	`}`
`115`	`135`	`}`
`116`	`136`	`})`
`117`	`137`
`118`		-siteCollector.OnHTML(`div.recommended__item`,func(e*colly.HTMLElement) {
`119`		-sceneURL:=e.Request.AbsoluteURL(e.ChildAttr(`a`,"href"))
`120`		`-`
`121`		`-// If scene exist in database, there's no need to scrape`
`122`		`-if!funk.ContainsString(knownScenes,sceneURL) {`
`123`		`-`
`124`		`-//Date & Duration from main index`
`125`		`-ctx:=colly.NewContext()`
`126`		-e.ForEach(`span.recommended__item__info__date`,func(idint,e*colly.HTMLElement) {
`127`		`-ifid==0 {`
`128`		`-ctx.Put("date",strings.TrimSpace(e.Text))`
`129`		`-}`
`130`		`-})`
`131`		-e.ForEach(`span.recommended__item__time`,func(idint,e*colly.HTMLElement) {
`132`		`-ifid==0 {`
`133`		`-ctx.Put("dur",strings.TrimSpace(e.Text))`
`134`		`-}`
	`138`	+siteCollector.OnHTML(`script`,func(e*colly.HTMLElement) {
	`139`	`+// only interested in the script containing window.__JUAN.rawInstance`
	`140`	+re:=regexp.MustCompile(`window\.__JUAN\.rawInstance = (\{.*?\});`)
	`141`	`+matches:=re.FindStringSubmatch(e.Text)`
	`142`	`+iflen(matches)>1 {`
	`143`	`+instanceJson:=gjson.ParseBytes([]byte(matches[1]))`
	`144`	`+token:=instanceJson.Get("jwt").String()`
	`145`	`+// set up api requests to use the token in the Instance Header`
	`146`	`+apiCollector.OnRequest(func(r*colly.Request) {`
	`147`	`+r.Headers.Set("Instance",token)`
`135`	`148`	`})`
`136`		`-`
`137`		`-sceneCollector.Request("GET",sceneURL,nil,ctx,nil)`
	`149`	`+apiCollector.Visit(nextApiUrl)`
`138`	`150`	`}`
`139`	`151`	`})`
`140`		`-`
`141`	`152`	`ifsingleSceneURL!="" {`
`142`	`153`	`ctx:=colly.NewContext()`
`143`	`154`	`ctx.Put("dur","")`
`144`	`155`	`ctx.Put("date","")`
	`156`	`+urlParts:=strings.Split(singleSceneURL,"/")`
	`157`	`+id:=urlParts[len(urlParts)-2]`
	`158`	`+offset=9999// do not read more pages, we only need 1`
	`159`	`+nextApiUrl="https://site-api.project1service.com/v2/releases/"+id`
	`160`	`+siteCollector.Visit("https://virtualporn.com/videos")`
`145`	`161`
`146`		`-sceneCollector.Request("GET",singleSceneURL,nil,ctx,nil)`
`147`	`162`	`}else {`
`148`		`-siteCollector.Visit("https://virtualporn.com/videos/"+strconv.Itoa(pageCnt))`
	`163`	`+// call virtualporn.com, this is just to get the instance token to use the api for this session`
	`164`	`+nextApiUrl="https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset="+strconv.Itoa(offset)`
	`165`	`+siteCollector.Visit("https://virtualporn.com/videos")`
`149`	`166`	`}`
`150`	`167`
`151`	`168`	`ifupdateSite {`
`@@ -158,3 +175,80 @@ func VirtualPorn(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out`
`158`	`175`	`funcinit() {`
`159`	`176`	`registerScraper("bvr","VirtualPorn","https://images.cn77nd.com/members/bangbros/favicon/apple-icon-60x60.png","virtualporn.com",VirtualPorn)`
`160`	`177`	`}`
	`178`	`+`
	`179`	`+// one off conversion routine called by migrations.go`
	`180`	`+funcUpdateVirtualPornIds()error {`
	`181`	`+collector:=createCollector("virtualporn.com")`
	`182`	`+apiCollector:=createCollector("site-api.project1service.com")`
	`183`	`+offset:=0`
	`184`	`+sceneCnt:=0`
	`185`	`+`
	`186`	+collector.OnHTML(`script`,func(e*colly.HTMLElement) {
	`187`	`+// only interested in a script containg window\.__JUAN\.rawInstance`
	`188`	+re:=regexp.MustCompile(`window\.__JUAN\.rawInstance = (\{.*?\});`)
	`189`	`+matches:=re.FindStringSubmatch(e.Text)`
	`190`	`+iflen(matches)>1 {`
	`191`	`+instanceJson:=gjson.ParseBytes([]byte(matches[1]))`
	`192`	`+token:=instanceJson.Get("jwt").String()`
	`193`	`+// set up api requests to use the token in the Instance Header`
	`194`	`+apiCollector.OnRequest(func(r*colly.Request) {`
	`195`	`+r.Headers.Set("Instance",token)`
	`196`	`+})`
	`197`	`+apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=100&offset="+strconv.Itoa(offset))`
	`198`	`+}`
	`199`	`+})`
	`200`	`+`
	`201`	`+apiCollector.OnResponse(func(r*colly.Response) {`
	`202`	`+db,_:=models.GetDB()`
	`203`	`+deferdb.Close()`
	`204`	`+`
	`205`	`+sceneListJson:=gjson.ParseBytes(r.Body)`
	`206`	`+sceneCnt=int(sceneListJson.Get("meta.total").Int())`
	`207`	`+scenes:=sceneListJson.Get("result")`
	`208`	`+scenes.ForEach(func(key,apiScene gjson.Result)bool {`
	`209`	`+id:=strconv.Itoa(int(apiScene.Get("id").Int()))`
	`210`	`+title:=apiScene.Get("title").String()`
	`211`	`+dateParts:=strings.Split(apiScene.Get("dateReleased").String(),"T")`
	`212`	`+releasedDate:=dateParts[0]`
	`213`	`+varscene models.Scene`
	`214`	`+scene.GetIfExist("bvr-"+id)`
	`215`	`+ifscene.ID>0 {`
	`216`	`+// get the next record, this one already matches the new id`
	`217`	`+returntrue`
	`218`	`+}`
	`219`	`+db.Where("scraper_id = ? and release_date_text = ?","bvr",releasedDate).Find(&scene)`
	`220`	`+ifscene.ID>0 {`
	`221`	`+oldSceneId:=scene.SceneID`
	`222`	`+log.Infof("Updating SceneId %s to %s ",oldSceneId,"bvr-"+id)`
	`223`	`+scene.LegacySceneID=scene.SceneID`
	`224`	`+scene.SceneID="bvr-"+id`
	`225`	`+scene.SceneURL="https://virtualporn.com/video/"+id+"/"+slugify.Slugify(strings.ReplaceAll(title,"'",""))`
	`226`	`+scene.MemberURL="https://site-ma.virtualporn.com/scene/"+id+"/"+slugify.Slugify(strings.ReplaceAll(title,"'",""))`
	`227`	`+`
	`228`	`+scene.Save()`
	`229`	`+result:=db.Model(&models.Action{}).Where("scene_id = ?",oldSceneId).Update("scene_id",scene.SceneID)`
	`230`	`+ifresult.Error!=nil {`
	`231`	`+log.Infof("Converting Actions for VirtualPorn Scene %s to %s failed, %s",oldSceneId,scene.SceneID,result.Error)`
	`232`	`+}`
	`233`	`+result=db.Model(&models.ExternalReferenceLink{}).Where("internal_table = 'scenes' and internal_name_id = ?",oldSceneId).Update("internal_name_id",scene.SceneID)`
	`234`	`+ifresult.Error!=nil {`
	`235`	`+log.Infof("Converting External Reference Links for VirtualPorn Scene %s to %s failed, %s",oldSceneId,scene.SceneID,result.Error)`
	`236`	`+}`
	`237`	`+}`
	`238`	`+returntrue`
	`239`	`+})`
	`240`	`+offset+=100`
	`241`	`+ifoffset<sceneCnt {`
	`242`	`+apiCollector.Visit("https://site-api.project1service.com/v2/releases?type=scene&limit=24&offset="+strconv.Itoa(offset))`
	`243`	`+}`
	`244`	`+})`
	`245`	`+`
	`246`	`+collector.Visit("https://virtualporn.com/videos")`
	`247`	`+`
	`248`	`+ifsceneCnt>0 {`
	`249`	`+returnnil`
	`250`	`+}else {`
	`251`	`+returnerrors.New("No scenes updated")`
	`252`	`+}`
	`253`	`+`
	`254`	`+}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit219f19a

File tree

3 files changed

3 files changed

`‎pkg/migrations/migrations.go`

`‎pkg/models/model_external_reference.go`

`‎pkg/scrape/virtualporn.go`

0 commit comments