Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitdfc27a4

Browse files
authored
feat: Ability To Limit the Scraping Rates by Site & Scraper ie Domain (#1665)
* Ability to limit the scraping rates by site & scraper, i.e. domain
* Refactor to not use maps; they are not safe for concurrent thread access
1 parent7596564 commitdfc27a4

File tree

7 files changed

+145
-13
lines changed

7 files changed

+145
-13
lines changed

‎pkg/scrape/genericactorscraper.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -266,10 +266,13 @@ func applyRules(actorPage string, source string, rules models.GenericScraperRule
266266
}
267267
})
268268
}
269+
url,_:=url.Parse(actorPage)
269270
ifrules.IsJson {
270-
actorCollector.Request("GET",actorPage,nil,nil,nil)
271+
ScraperRateLimiterWait(url.Host)
272+
err:=actorCollector.Request("GET",actorPage,nil,nil,nil)
273+
ScraperRateLimiterCheckErrors(url.Host,err)
271274
}else {
272-
actorCollector.Visit(actorPage)
275+
WaitBeforeVisit(url.Host,actorCollector.Visit,actorPage)
273276
}
274277
varextref models.ExternalReference
275278
varextreflink models.ExternalReferenceLink

‎pkg/scrape/povr.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,21 +126,21 @@ func POVR(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<-
126126

127127
// If scene exists in database, or the alternate source exists, there's no need to scrape
128128
if!funk.ContainsString(knownScenes,sceneURL)&&!strings.Contains(sceneURL,"/join") {
129-
sceneCollector.Visit(sceneURL)
129+
WaitBeforeVisit("povr.com",sceneCollector.Visit,sceneURL)
130130
}
131131
})
132132

133133
siteCollector.OnHTML(`div.pagination a[class="pagination__page next"]`,func(e*colly.HTMLElement) {
134134
if!limitScraping {
135135
pageURL:=e.Request.AbsoluteURL(e.Attr("href"))
136-
siteCollector.Visit(pageURL)
136+
WaitBeforeVisit("povr.com",siteCollector.Visit,pageURL)
137137
}
138138
})
139139

140140
ifsingleSceneURL!="" {
141141
sceneCollector.Visit(singleSceneURL)
142142
}else {
143-
siteCollector.Visit(siteURL)
143+
WaitBeforeVisit("povr.com",siteCollector.Visit,siteURL)
144144
}
145145

146146
ifupdateSite {

‎pkg/scrape/scrape.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,25 @@ func createCollector(domains ...string) *colly.Collector {
3434
})
3535

3636
c=createCallbacks(c)
37+
38+
// see if the domain has a limit and set it
39+
for_,domain:=rangedomains {
40+
ifLimiters==nil {
41+
LoadScraperRateLimits()
42+
}
43+
limiter:=GetRateLimiter(domain)
44+
iflimiter!=nil {
45+
randomdelay:=limiter.maxDelay-limiter.minDelay
46+
delay:=limiter.minDelay
47+
c.Limit(&colly.LimitRule{
48+
DomainGlob:"*",
49+
Delay:delay,// Delay between requests to domains matching the glob
50+
RandomDelay:randomdelay,// Max additional random delay added to the delay
51+
})
52+
break
53+
}
54+
}
55+
3756
returnc
3857
}
3958

‎pkg/scrape/scrape_rate_limiter.go

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
package scrape

import (
	"math/rand"
	"sync"
	"time"

	"github.com/tidwall/gjson"
	"github.com/xbapps/xbvr/pkg/models"
)

// Colly provides rate limiting on collectors and this works in most scrapers.
// For scrapers that handle multiple sites, e.g. SLR, VRPorn, this does not work, as
// each site creates its own instance of the scraper with its own colly collector,
// and therefore its own independent limits.

// The ScraperRateLimiter provides a way to limit visits across multiple instances
// of the same scraper. Calls to the colly collector Visit function must first be
// passed to the ScraperRateLimiter, which coordinates between all instances and
// then calls the colly Visit function.
var Limiters []*ScraperRateLimiter

// limitersMutex serializes (re)loads of the Limiters slice.
// NOTE(review): the original locked a function-local mutex inside
// LoadScraperRateLimits, which protects nothing; a package-level mutex is
// required for the lock to have any effect.
var limitersMutex sync.Mutex

// ScraperRateLimiter tracks the last request time for one domain/site id and
// the configured delay window between requests.
type ScraperRateLimiter struct {
	id          string
	mutex       sync.Mutex
	lastRequest time.Time
	minDelay    time.Duration
	maxDelay    time.Duration
}

// ScraperRateLimiterWait blocks until at least minDelay (plus a random extra up
// to maxDelay) has elapsed since the last request recorded for rateLimiter.
// It is a no-op when no limiter is configured for that id.
func ScraperRateLimiterWait(rateLimiter string) {
	limiter := GetRateLimiter(rateLimiter)
	if limiter == nil {
		return
	}
	limiter.mutex.Lock()
	defer limiter.mutex.Unlock()

	if limiter.lastRequest.IsZero() {
		// No previous request recorded: don't wait, just start the clock.
		limiter.lastRequest = time.Now()
		return
	}
	timeSinceLast := time.Since(limiter.lastRequest)

	delay := limiter.minDelay
	if limiter.maxDelay > limiter.minDelay {
		// Introduce a random delay between minDelay and maxDelay.
		delay += time.Duration(rand.Int63n(int64(limiter.maxDelay - limiter.minDelay)))
	}
	if timeSinceLast < delay {
		time.Sleep(delay - timeSinceLast)
	}
	limiter.lastRequest = time.Now()
}

// WaitBeforeVisit applies the rate limit for rateLimiter, then calls
// visitFunc(pageURL) (typically a colly collector's Visit).
func WaitBeforeVisit(rateLimiter string, visitFunc func(string) error, pageURL string) {
	ScraperRateLimiterWait(rateLimiter)
	err := visitFunc(pageURL)
	if err != nil {
		// If an error is returned, no HTTP call was made by colly (these are
		// errors colly checks before calling the URL), i.e. the site was never
		// visited. Reset the timestamp so the next call is not needlessly delayed.
		limiter := GetRateLimiter(rateLimiter)
		if limiter != nil {
			limiter.lastRequest = time.Time{}
		}
	}
}

// ScraperRateLimiterCheckErrors resets the limiter's timestamp when a request
// errored before being sent, mirroring WaitBeforeVisit's error handling for
// callers that use collector.Request directly.
func ScraperRateLimiterCheckErrors(domain string, err error) {
	if err != nil {
		limiter := GetRateLimiter(domain)
		// BUG FIX: the original dereferenced limiter unconditionally, panicking
		// for any domain without a configured limiter.
		if limiter != nil {
			limiter.lastRequest = time.Time{}
		}
	}
}

// LoadScraperRateLimits (re)builds the Limiters slice from the
// "scraper_rate_limits" KV entry. Delays are stored in milliseconds; maxDelay
// is clamped up to minDelay so the random window is never negative.
func LoadScraperRateLimits() {
	limitersMutex.Lock()
	defer limitersMutex.Unlock()

	var limiters []*ScraperRateLimiter
	commonDb, _ := models.GetCommonDB()
	var kv models.KV
	commonDb.Where(models.KV{Key: "scraper_rate_limits"}).Find(&kv)
	if kv.Key == "scraper_rate_limits" {
		sites := gjson.Get(kv.Value, "sites")
		for _, site := range sites.Array() {
			name := site.Get("name").String()
			minDelay := int(site.Get("mindelay").Int())
			maxDelay := int(site.Get("maxdelay").Int())
			if maxDelay < minDelay {
				maxDelay = minDelay
			}
			limiters = append(limiters, &ScraperRateLimiter{
				id:       name,
				minDelay: time.Duration(minDelay) * time.Millisecond,
				maxDelay: time.Duration(maxDelay) * time.Millisecond,
			})
		}
		// Swap in the freshly-built slice in one assignment.
		Limiters = limiters
	}
}

// GetRateLimiter returns the limiter for id, or nil when none is configured.
func GetRateLimiter(id string) *ScraperRateLimiter {
	for _, limiter := range Limiters {
		if limiter.id == id {
			return limiter // unreachable `break` after return removed
		}
	}
	return nil
}

‎pkg/scrape/slrstudios.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,7 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
305305
siteCollector.OnHTML(`div.c-pagination ul li a`,func(e*colly.HTMLElement) {
306306
if!limitScraping {
307307
pageURL:=e.Request.AbsoluteURL(e.Attr("href"))
308-
siteCollector.Visit(pageURL)
308+
WaitBeforeVisit("www.sexlikereal.com",siteCollector.Visit,pageURL)
309309
}
310310
})
311311

@@ -386,7 +386,9 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
386386
ctx:=colly.NewContext()
387387
ctx.Put("duration",duration)
388388
ctx.Put("isTransScene",isTransScene)
389-
sceneCollector.Request("GET",sceneURL,nil,ctx,nil)
389+
ScraperRateLimiterWait("www.sexlikereal.com")
390+
err:=sceneCollector.Request("GET",sceneURL,nil,ctx,nil)
391+
ScraperRateLimiterCheckErrors("www.sexlikereal.com",err)
390392
}
391393
}
392394
})
@@ -399,7 +401,7 @@ func SexLikeReal(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out
399401
sceneCollector.Request("GET",singleSceneURL,nil,ctx,nil)
400402

401403
}else {
402-
siteCollector.Visit(siteURL+"?sort=most_recent")
404+
WaitBeforeVisit("www.sexlikereal.com",siteCollector.Visit,siteURL+"?sort=most_recent")
403405
}
404406

405407
ifupdateSite {

‎pkg/scrape/vrphub.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,9 @@ func VRPHub(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
191191
ctx:=colly.NewContext()
192192
ctx.Put("scene",&sc)
193193

194-
sceneCollector.Request("GET",sceneURL,nil,ctx,nil)
194+
ScraperRateLimiterWait("vrphub.com")
195+
err:=sceneCollector.Request("GET",sceneURL,nil,ctx,nil)
196+
ScraperRateLimiterCheckErrors("vrphub.com",err)
195197
}
196198
})
197199

@@ -201,7 +203,7 @@ func VRPHub(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
201203
ctx.Put("scene",&sc)
202204
sceneCollector.Request("GET",singleSceneURL,nil,ctx,nil)
203205
}else {
204-
siteCollector.Visit(siteURL)
206+
WaitBeforeVisit("vrphub.com",siteCollector.Visit,siteURL)
205207
}
206208

207209
ifupdateSite {

‎pkg/scrape/vrporn.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,22 +145,22 @@ func VRPorn(wg *sync.WaitGroup, updateSite bool, knownScenes []string, out chan<
145145
siteCollector.OnHTML(`div.pagination a.next`,func(e*colly.HTMLElement) {
146146
if!limitScraping {
147147
pageURL:=e.Request.AbsoluteURL(e.Attr("href"))
148-
siteCollector.Visit(pageURL)
148+
WaitBeforeVisit("vrporn.com",siteCollector.Visit,pageURL)
149149
}
150150
})
151151

152152
siteCollector.OnHTML(`body.tax-studio article.post div.tube-post a`,func(e*colly.HTMLElement) {
153153
sceneURL:=e.Request.AbsoluteURL(e.Attr("href"))
154154
// If scene exists in database, or the alternate source exists, there's no need to scrape
155155
if!funk.ContainsString(knownScenes,sceneURL) {
156-
sceneCollector.Visit(sceneURL)
156+
WaitBeforeVisit("vrporn.com",sceneCollector.Visit,sceneURL)
157157
}
158158
})
159159

160160
ifsingleSceneURL!="" {
161161
sceneCollector.Visit(singleSceneURL)
162162
}else {
163-
siteCollector.Visit(siteURL+"/?sort=newest")
163+
WaitBeforeVisit("vrporn.com",siteCollector.Visit,siteURL+"/?sort=newest")
164164
}
165165

166166
ifupdateSite {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp