Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 308d570

Browse files
author
Adam Bell
committed
SAVEPOINT
1 parent 571b3cb, commit 308d570

File tree

5 files changed

+118
-30
lines changed

5 files changed

+118
-30
lines changed

‎build.sbt‎

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,10 @@ libraryDependencies ++= Seq(
5656
"io.lemonlabs"%%"scala-uri"%"1.3.1",
5757
// Ammonite
5858
"com.lihaoyi"%"ammonite"%"1.1.2"%"test" crossCrossVersion.full,
59-
"com.github.alexandrnikitin"%%"bloom-filter"%"latest.release"
59+
60+
//bloom filter
61+
"com.github.alexandrnikitin"%%"bloom-filter"%"latest.release",
62+
"commons-codec"%"commons-codec"%"1.9"
6063
)
6164

6265

‎src/main/scala/net/degoes/scraper/models.scala‎

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
packagenet.degoes.scraper
22

3-
importjava.nio.file.Path
3+
importjava.io.{File,FileWriter}
4+
importjava.nio.file.{Files,Path,Paths}
45

56
importnet.degoes.scraper.test.SiteIndex
67
importnet.degoes.scraper.url.URL
78
importscalaz.zio._
9+
importjava.nio.charset.StandardCharsets._
10+
importjava.nio.file.{Files,Paths}
11+
12+
importscala.util.Try
813

914
objectmodels {
1015

@@ -16,14 +21,27 @@ object models {
1621

1722
/**
 * Builds a fetcher that consults an on-disk cache before going to the network.
 *
 * The cache entry for a URL is the file named after `url.digest`, located directly
 * under `rootPath`.
 *
 * @param rootPath directory that holds the cached page bodies
 * @return a function that yields the cached body when a readable entry exists and
 *         otherwise delegates to `getURL`
 */
def getURLCached(rootPath: Path): URL => IO[Exception, String] =
  (url: URL) => {
    // Same naming scheme as the cache writer: "<root>/<digest>".
    // NOTE(review): the digest renders as "sha256:<hex>"; the ':' is not a legal
    // file-name character on Windows — confirm the target platforms.
    val cacheFile = Paths.get(rootPath.toAbsolutePath.toString + "/" + url.digest)

    // A missing or unreadable entry is treated as a cache miss, not as a failure.
    val cached: IO[Exception, Option[String]] = IO.sync {
      Try {
        if (Files.exists(cacheFile)) Some(new String(Files.readAllBytes(cacheFile), UTF_8))
        else None
      }.toOption.flatten
    }

    // Only go to the network on a miss.
    cached.flatMap(_.fold(getURL(url))(html => IO.now(html)))
  }
2337

2438
/**
 * Builds a processor that stores a page body in the on-disk cache.
 *
 * The body is written as UTF-8 to the file named after `url.digest` under `rootPath`.
 * Writing is best-effort: any I/O problem is swallowed so that a cache failure never
 * fails the crawl.
 *
 * @param rootPath directory that holds the cached page bodies (created if missing)
 * @return a function taking the URL and its HTML and performing the write
 */
def writeToCacheProcessor(rootPath: Path): (URL, String) => IO[Unit, Unit] =
  (url, html) =>
    IO.sync {
      Try {
        val root = rootPath.toAbsolutePath
        Files.createDirectories(root)
        // Explicit UTF-8 so the bytes on disk do not depend on the platform default charset.
        Files.write(Paths.get(root.toString + "/" + url.digest), html.getBytes(UTF_8))
      }
      ()
    }
2745

2846

2947
valIdProcessor: (URL,String)=>IO[Unit,List[(URL,String)]]=
@@ -35,5 +53,27 @@ object models {
3553
_<- writeToCacheProcessor(rootPath)(url, html)
3654
r<-IdProcessor(url,html)
3755
}yield r
56+
57+
// Threads used for the blocking URL reads performed by getURL.
private val blockingPool = java.util.concurrent.Executors.newCachedThreadPool()

/**
 * Downloads the content behind `url` as a UTF-8 string.
 *
 * The blocking read is submitted to `blockingPool`; its outcome is handed back through
 * a promise that the returned action waits on.
 *
 * @param url the address to download
 * @return the body on success, or the `Exception` raised while reading
 */
def getURL(url: URL): IO[Exception, String] =
  for {
    //eventually this would be IO.blocking
    promise <- Promise.make[Exception, String]
    _ <- (for {
      exitResult <- IO.async[Nothing, ExitResult[Exception, String]](k => blockingPool.submit(
        new Runnable() {
          override def run(): Unit = {
            // Compute the outcome first so the callback is invoked exactly once.
            val outcome: ExitResult[Exception, String] =
              try {
                val source = scala.io.Source.fromURL(url.url)(scala.io.Codec.UTF8)
                // Always release the underlying stream, even if reading fails.
                try ExitResult.Completed(source.mkString)
                finally source.close()
              } catch {
                case e: Exception => ExitResult.Failed(e)
              }
            k(ExitResult.Completed(outcome))
          }
        }
      )): IO[Nothing, ExitResult[Exception, String]]
      _ <- promise.done(exitResult)
    } yield ()).fork
    html <- promise.get
  } yield html
3878
}
3979

‎src/main/scala/net/degoes/scraper/scraper.scala‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ object scraper {
1313
seeds :Set[URL],
1414
router :URL=>Set[URL],
1515
processor : (URL,String)=>IO[E,A],
16-
getURL :URL=>IO[Exception,String]= getURL(_)
16+
getURL :URL=>IO[Exception,String]=models.getURL(_)
1717
):IO[Nothing,Crawl[E,A]]= {
1818
defloop(seeds:Set[URL],ref:Ref[(Crawl[E,A],Set[URL])]):IO[Nothing,Unit]=
1919
IO.parTraverse(seeds)(url=>

‎src/main/scala/net/degoes/scraper/test.scala‎

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
packagenet.degoes.scraper
22

3+
importjava.nio.file.{Path,Paths}
4+
35
importscalaz.Monoid
46
importscalaz.zio.{App,ExitResult,IO,Promise}
57
importscalaz._
68
importScalaz.{mzero,_}
9+
importnet.degoes.scraper.test.{Home,Processor,getURL}
710
importnet.degoes.scraper.url.URL
811
importscalaz.zio.console._
912

@@ -49,3 +52,28 @@ object test extends App {
4952
_=>ExitStatus.ExitNow(0)
5053
)
5154
}
55+
56+
57+
/**
 * Manual entry point that crawls from a single seed URL with the file-backed cache
 * rooted at `rootFilePath`, then prints the first component of every result entry.
 *
 * The two handlers passed to `redeemPure` map to exit status 1 and 0 respectively.
 */
object test1 extends App {

  val rootFilePath = Paths.get("/Users/abell/temp1")
  val start = Set(URL("https://scalaz.github.io/7/").get)

  def run(args: List[String]): IO[Nothing, ExitStatus] = {
    val program =
      for {
        _ <- putStrLn("Starting")
        crawled <- scraper.crawlIOPar(
          start,
          models.stayInSeedDomainRouter(start),
          models.returnAndCache(rootFilePath),
          models.getURLCached(rootFilePath)
        )
        listing = crawled.value.map(_._1).mkString("\n")
        _ <- putStrLn(s"results :\n$listing")
      } yield ()

    program.redeemPure(
      _ => ExitStatus.ExitNow(1),
      _ => ExitStatus.ExitNow(0)
    )
  }
}

‎src/main/scala/net/degoes/scraper/url.scala‎

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
packagenet.degoes.scraper
22

3+
importorg.apache.commons.codec.binary.Hex
4+
importorg.apache.commons.codec.digest.DigestUtils
35
importscalaz.Scalaz.{mzero,_}
46
importscalaz.{Monoid,_}
57
importscalaz.zio._
68

9+
importscala.util.Try
10+
711
objecturl {
812

913
finalcaseclassURLprivate (parsed: io.lemonlabs.uri.Url) {
@@ -24,6 +28,8 @@ object url {
2428

2529
defurl:String= parsed.toString
2630

31+
valdigest:SHA256Hash=SHA256Hash.create(parsed.toString())
32+
2733
overridedefequals(a:Any):Boolean= amatch {
2834
casethat :URL=>this.url== that.url
2935
case _=>false
@@ -42,28 +48,6 @@ object url {
4248
}
4349
}
4450

45-
privatevalblockingPool= java.util.concurrent.Executors.newCachedThreadPool()
46-
47-
defgetURL(url:URL):IO[Exception,String]=
48-
for {
49-
//eventually this would be IO.blocking
50-
promise<-Promise.make[Exception,String]
51-
_<- (for {
52-
exitResult<-IO.async[Nothing,ExitResult[Exception,String]](k=> blockingPool.submit(
53-
newRunnable () {
54-
defrun:Unit=
55-
try {
56-
k(ExitResult.Completed(ExitResult.Completed(scala.io.Source.fromURL(url.url)(scala.io.Codec.UTF8).mkString)))
57-
}catch {
58-
casee :Exception=> k(ExitResult.Completed(ExitResult.Failed(e)))
59-
}
60-
}
61-
)):IO[Nothing,ExitResult[Exception,String]]
62-
_<- promise.done(exitResult)
63-
}yield ()).fork
64-
html<- promise.get
65-
}yield html
66-
6751
defextractURLs(root:URL,html:String):List[URL]= {
6852
valpattern="href=[\"\']([^\"\']+)[\"\']".r
6953

@@ -76,4 +60,37 @@ object url {
7660
}yield url
7761
}).getOrElse(Nil)
7862
}
63+
64+
65+
66+
/** Factory and extractor helpers for `SHA256Hash`. */
object SHA256Hash {

  // A SHA-256 digest rendered as hex is exactly 64 hex digits.
  private val HexDigest = "[0-9a-fA-F]{64}"

  /**
   * Parses a textual hash, with or without the "sha256:" prefix.
   *
   * @param potentiallyValidHash candidate text; may be null
   * @return the hash when the text is a 64-digit hex string (optionally prefixed),
   *         `None` for null or malformed input
   */
  def parse(potentiallyValidHash: String): Option[SHA256Hash] =
    Option(potentiallyValidHash)
      .filter(_.stripPrefix("sha256:").matches(HexDigest))
      .map(new SHA256Hash(_))

  /** Computes the SHA-256 digest of `input` and wraps it. */
  def create(input: String): SHA256Hash = new SHA256Hash(DigestUtils.sha256(input))

  /** Extractor form of `parse`, for use in pattern matches on strings. */
  def unapply(raw: String): Option[SHA256Hash] = parse(raw)
}
72+
73+
/**
 * A hash value kept in textual form and always rendered with a "sha256:" prefix.
 *
 * Equality, `hashCode` and `toString` are all defined on the prefixed rendering, so
 * the same text given with or without the prefix yields equal instances.
 *
 * @param potentiallyValidHash the hash text, with or without the "sha256:" prefix;
 *                             no format validation happens here
 */
class SHA256Hash(potentiallyValidHash: String) {

  /** Builds the hash from raw digest bytes by hex-encoding them. */
  def this(hashAsBytes: Array[Byte]) = this(Hex.encodeHexString(hashAsBytes))

  // Strip the prefix once so prefixed and bare inputs normalise to the same text.
  private val bareHash = potentiallyValidHash.stripPrefix("sha256:")

  lazy val asStringWithPrefix: String = s"sha256:$bareHash"

  override def equals(other: Any): Boolean = other match {
    case that: SHA256Hash => that.asStringWithPrefix == asStringWithPrefix
    case _ => false
  }

  // Derived from the same string that equals compares.
  override def hashCode(): Int = asStringWithPrefix.hashCode

  def canEqual(other: Any): Boolean = other.isInstanceOf[SHA256Hash]

  override def toString: String = asStringWithPrefix
}
7996
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp