Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 322e62e

Browse files
authored
scraper: Fix for SexbabesVR scraper (#1847)
* Fix for SexbabesVR scraper
  The scene ID in the webpage now seems to be 614 for all scenes, causing all scenes to be rescraped and new scenes never to be added. This pulls the poster URL, which appears to have a unique identifier in the second-to-last directory. Also updated the cover URL to pull the image used for the thumbnail on the index page, as the latest scene has an SBS image for the cover whereas the thumbnail contains a more useful image. All appears functional.
* Remove debug prompts
* Fix for the blank synopsis
  There are three separate variations on how they have this information posted, depending on the age of the scene. A random sampling over all scenes shows that the synopsis is successfully being scraped.
* Add migration code
  It ran once; I am unsure of how to properly test it, though.
* Fix logic
* Improve migration code
  Added some error handling in case the website is unreachable. Added logic to ensure we only check scenes originating from SexBabesVR. Check only scenes starting at 600, as this is where the reported divergence between the sceneID sources' numbering occurred, and only update scenes that diverge in ID.
1 parent b0512d9 commit 322e62e

File tree

2 files changed

+115
-9
lines changed

2 files changed

+115
-9
lines changed

‎pkg/migrations/migrations.go

Lines changed: 86 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,10 +9,12 @@ import (
99
"path/filepath"
1010
"regexp"
1111
"runtime"
12+
"strconv"
1213
"strings"
1314
"time"
1415

1516
"github.com/go-resty/resty/v2"
17+
"github.com/gocolly/colly/v2"
1618
"github.com/jinzhu/gorm"
1719
"github.com/markphelps/optional"
1820
"github.com/mozillazg/go-slugify"
@@ -1986,6 +1988,90 @@ func Migrate() {
19861988
returntx.Model(&models.Tag{}).Exec("delete from tags where `count` = 0").Error
19871989
},
19881990
},
1991+
{
1992+
// Had to switch to a different sceneID source, causing a shift in sceneIDs
1993+
ID:"0080-fix-SexBabesVR-ids",
1994+
Migrate:func(tx*gorm.DB)error {
1995+
newSceneId:=func(sitestring,urlstring) (string,int) {
1996+
sceneID:=""
1997+
statusCode:=200
1998+
1999+
sceneCollector:=colly.NewCollector(
2000+
colly.AllowedDomains("sexbabesvr.com"),
2001+
)
2002+
2003+
sceneCollector.OnError(func(r*colly.Response,errerror) {
2004+
common.Log.Errorf("Error visiting %s %s",r.Request.URL,err)
2005+
statusCode=r.StatusCode
2006+
})
2007+
2008+
sceneCollector.OnHTML(`html`,func(e*colly.HTMLElement) {
2009+
2010+
// Scene ID
2011+
e.ForEach(`dl8-video`,func(idint,e*colly.HTMLElement) {
2012+
posterURL:=e.Request.AbsoluteURL(e.Attr("poster"))
2013+
tmp:=strings.Split(posterURL,"/")
2014+
sceneID=slugify.Slugify(site)+"-"+tmp[len(tmp)-2]
2015+
})
2016+
})
2017+
2018+
sceneCollector.Visit(url)
2019+
2020+
returnsceneID,statusCode
2021+
}
2022+
2023+
varscenes []models.Scene
2024+
err:=tx.Where("studio = ?","SexBabesVR").Find(&scenes).Error
2025+
iferr!=nil {
2026+
returnerr
2027+
}
2028+
for_,scene:=rangescenes {
2029+
2030+
// Need both the site ID string and the scene ID as an integer for the logic below
2031+
tmp:=strings.Split(scene.SceneID,"-")
2032+
sceneIDint,_:=strconv.Atoi(tmp[1])
2033+
2034+
// Check to make sure we are only updating scenes originating on SexBabesVR and only starting at scene 600; sc.SiteID is not accurate in terms of alt sites
2035+
// Scene 600 is where the scene IDs start to merge when changing our scene ID source for SexBabesVR
2036+
iftmp[0]=="sexbabesvr"&&sceneIDint>=600 {
2037+
2038+
common.Log.Infoln("Checking sceneid:",scene.SceneID)
2039+
sceneID,statusCode:=newSceneId(scene.Site,scene.SceneURL)
2040+
2041+
ifstatusCode!=200 {
2042+
returnerr
2043+
}
2044+
2045+
ifsceneID=="" {
2046+
common.Log.Warnf("Could not update scene %s",scene.SceneID)
2047+
continue
2048+
}
2049+
2050+
ifscene.SceneID!=sceneID {
2051+
// update all actions referring to this scene by its scene_id
2052+
err=tx.Model(&models.Action{}).Where("scene_id = ?",scene.SceneID).Update("scene_id",sceneID).Error
2053+
iferr!=nil {
2054+
returnerr
2055+
}
2056+
2057+
// update the scene itself
2058+
common.Log.Infoln("Updating sceneid:",scene.SceneID,"to",sceneID)
2059+
scene.SceneID=sceneID
2060+
err=tx.Save(&scene).Error
2061+
iferr!=nil {
2062+
returnerr
2063+
}
2064+
}
2065+
2066+
}
2067+
}
2068+
2069+
// since scenes have new IDs, we need to re-index them
2070+
tasks.SearchIndex()
2071+
2072+
returnnil
2073+
},
2074+
},
19892075
})
19902076

19912077
iferr:=m.Migrate();err!=nil {

‎pkg/scrape/sexbabesvr.go

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -30,13 +30,18 @@ func SexBabesVR(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out
3030
sc.Site=siteID
3131
sc.HomepageURL=strings.Split(e.Request.URL.String(),"?")[0]
3232

33-
// Scene ID -
33+
// Scene ID
3434
e.ForEach(`dl8-video`,func(idint,e*colly.HTMLElement) {
35-
sc.SiteID=e.Attr("data-scene")
35+
posterURL:=e.Request.AbsoluteURL(e.Attr("poster"))
36+
tmp:=strings.Split(posterURL,"/")
37+
sc.SiteID=tmp[len(tmp)-2]
3638
sc.SceneID=slugify.Slugify(sc.Site)+"-"+sc.SiteID
37-
sc.Covers=append(sc.Covers,strings.Replace(e.Attr("poster"),"/videoDetail2x","",-1))
3839
})
3940

41+
// Cover Url
42+
coverURL:=e.Request.Ctx.GetAny("coverURL").(string)
43+
sc.Covers=append(sc.Covers,coverURL)
44+
4045
// Title
4146
e.ForEach(`div.video-detail__description--container h1`,func(idint,e*colly.HTMLElement) {
4247
sc.Title=strings.TrimSpace(e.Text)
@@ -48,10 +53,22 @@ func SexBabesVR(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out
4853
})
4954

5055
// Synopsis
51-
e.ForEach(`div.video-detail>div.container>p`,func(idint,e*colly.HTMLElement) {
52-
// Handle blank <p></p> surrounding the synopsis
53-
ifstrings.TrimSpace(e.Text)!="" {
54-
sc.Synopsis=strings.TrimSpace(e.Text)
56+
e.ForEach(`div.list-of-categories__p`,func(idint,e*colly.HTMLElement) {
57+
synopsis:=e.Text
58+
59+
ifsynopsis=="" {
60+
synopsis=e.ChildText(`p.ql-align-justify`)
61+
62+
ifsynopsis=="" {
63+
e.ForEach(`div`,func(idint,e*colly.HTMLElement) {
64+
synopsis=synopsis+" "+strings.TrimSpace(e.Text)
65+
})
66+
67+
}
68+
}
69+
70+
ifstrings.TrimSpace(synopsis)!="" {
71+
sc.Synopsis=strings.TrimSpace(synopsis)
5572
}
5673
})
5774

@@ -104,10 +121,13 @@ func SexBabesVR(wg *models.ScrapeWG, updateSite bool, knownScenes []string, out
104121
})
105122

106123
siteCollector.OnHTML(`div.videos__content`,func(e*colly.HTMLElement) {
107-
e.ForEach(`a.video-container__description--title`,func(cntint,e*colly.HTMLElement) {
124+
e.ForEach(`a.video-container__image`,func(cntint,e*colly.HTMLElement) {
108125
sceneURL:=e.Request.AbsoluteURL(e.Attr("href"))
109126
if!funk.ContainsString(knownScenes,sceneURL) {
110-
sceneCollector.Visit(sceneURL)
127+
coverURL:=e.ChildAttr("a.video-container__image img","data-src")
128+
ctx:=colly.NewContext()
129+
ctx.Put("coverURL",coverURL)
130+
sceneCollector.Request("GET",sceneURL,nil,ctx,nil)
111131
}
112132
})
113133
})

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp