Commit 76f5cce

Author: Pedro Bernardo committed
Changed scripts to use the conf parameter on the SparkContext constructor & Removed setLogLevel
1 parent a6dc078 · commit 76f5cce

File tree

14 files changed: +77 -54 lines changed

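For orientation, here is a minimal sketch of the pattern this commit applies to every script: build a SparkConf with an app name and a master URL, pass it to SparkContext through the conf parameter, and drop the sc.setLogLevel("ERROR") call. The app name "wordCounts", the master string "local[3]", and the input path are taken from the WordCount diff below; the sc.stop() call and the print are illustrative additions, not part of the commit.

from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
    # Old style, removed by this commit:
    #   sc = SparkContext("local", "wordCounts")
    #   sc.setLogLevel("ERROR")
    # New style: configure through SparkConf and pass it via the conf parameter.
    # "local[3]" asks Spark to run locally with 3 worker threads.
    conf = SparkConf().setAppName("wordCounts").setMaster("local[3]")
    sc = SparkContext(conf=conf)

    lines = sc.textFile("in/word_count.text")
    print("line count: {}".format(lines.count()))

    sc.stop()  # not in the original scripts; added here only to end the sketch cleanly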

‎pairRdd/aggregation/reducebykey/WordCount.py

Lines changed: 3 additions & 4 deletions
@@ -1,9 +1,8 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-
-    sc = SparkContext("local", "wordCounts")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("wordCounts").setMaster("local[3]")
+    sc = SparkContext(conf=conf)

    lines = sc.textFile("in/word_count.text")
    wordRdd = lines.flatMap(lambda line: line.split(" "))

‎pairRdd/create/PairRddFromRegularRdd.py

Lines changed: 3 additions & 4 deletions
@@ -1,9 +1,8 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-
-    sc = SparkContext("local", "create")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("create").setMaster("local")
+    sc = SparkContext(conf=conf)

    inputStrings = ["Lily 23", "Jack 29", "Mary 29", "James 8"]
    regularRDDs = sc.parallelize(inputStrings)

‎pairRdd/create/PairRddFromTupleList.py

Lines changed: 3 additions & 4 deletions
@@ -1,9 +1,8 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-
-    sc = SparkContext("local", "create")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("create").setMaster("local")
+    sc = SparkContext(conf=conf)

    tuples = [("Lily", 23), ("Jack", 29), ("Mary", 29), ("James", 8)]
    pairRDD = sc.parallelize(tuples)

Lines changed: 9 additions & 6 deletions
@@ -1,18 +1,21 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-
-    sc = SparkContext("local", "GroupByKeyVsReduceByKey")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName('GroupByKeyVsReduceByKey').setMaster("local[*]")
+    sc = SparkContext(conf=conf)

    words = ["one", "two", "two", "three", "three", "three"]
    wordsPairRdd = sc.parallelize(words).map(lambda word: (word, 1))

-    wordCountsWithReduceByKey = wordsPairRdd.reduceByKey(lambda x, y: x + y).collect()
+    wordCountsWithReduceByKey = wordsPairRdd \
+        .reduceByKey(lambda x, y: x + y) \
+        .collect()
    print("wordCountsWithReduceByKey: {}".format(list(wordCountsWithReduceByKey)))

    wordCountsWithGroupByKey = wordsPairRdd \
        .groupByKey() \
-        .mapValues(lambda intIterable: len(intIterable)) \
+        .mapValues(len) \
        .collect()
    print("wordCountsWithGroupByKey: {}".format(list(wordCountsWithGroupByKey)))
+
+

‎pairRdd/join/JoinOperations.py

Lines changed: 3 additions & 4 deletions
@@ -1,9 +1,8 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-
-    sc = SparkContext("local", "JoinOperations")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("JoinOperations").setMaster("local[1]")
+    sc = SparkContext(conf=conf)

    ages = sc.parallelize([("Tom", 29), ("John", 22)])
    addresses = sc.parallelize([("James", "USA"), ("John", "UK")])

‎rdd/WordCount.py

Lines changed: 9 additions & 5 deletions
@@ -1,11 +1,15 @@
-import sys
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-    sc = SparkContext("local", "word count")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("word count").setMaster("local[3]")
+    sc = SparkContext(conf=conf)
+
    lines = sc.textFile("in/word_count.text")
+
    words = lines.flatMap(lambda line: line.split(" "))
+
    wordCounts = words.countByValue()
+
    for word, count in wordCounts.items():
-        print(word, count)
+        print("{} : {}".format(word, count))
+

‎rdd/collect/CollectExample.py

Lines changed: 9 additions & 4 deletions
@@ -1,10 +1,15 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-    sc = SparkContext("local", "collect")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("collect").setMaster("local[*]")
+    sc = SparkContext(conf=conf)
+
    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
+
    wordRdd = sc.parallelize(inputWords)
+
    words = wordRdd.collect()
+
    for word in words:
-        print(word)
+        print(word)
+

‎rdd/count/CountExample.py

Lines changed: 4 additions & 3 deletions
@@ -1,12 +1,13 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-    sc = SparkContext("local", "count")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("count").setMaster("local[*]")
+    sc = SparkContext(conf=conf)
    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
    wordRdd = sc.parallelize(inputWords)
    print("Count: {}".format(wordRdd.count()))
    worldCountByValue = wordRdd.countByValue()
    print("CountByValue: ")
    for word, count in worldCountByValue.items():
        print("{} : {}".format(word, count))
+

‎rdd/nasaApacheWebLogs/SameHostsSolution.py

Lines changed: 3 additions & 2 deletions
@@ -1,7 +1,8 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-    sc = SparkContext("local", "sameHosts")
+    conf = SparkConf().setAppName("sameHosts").setMaster("local[1]")
+    sc = SparkContext(conf=conf)

    julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")

Lines changed: 5 additions & 3 deletions
@@ -1,10 +1,11 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

def isNotHeader(line: str):
    return not (line.startswith("host") and "bytes" in line)

if __name__ == "__main__":
-    sc = SparkContext("local", "unionLogs")
+    conf = SparkConf().setAppName("unionLogs").setMaster("local[*]")
+    sc = SparkContext(conf=conf)

    julyFirstLogs = sc.textFile("in/nasa_19950701.tsv")
    augustFirstLogs = sc.textFile("in/nasa_19950801.tsv")
@@ -14,4 +15,5 @@ def isNotHeader(line: str):
    cleanLogLines = aggregatedLogLines.filter(isNotHeader)
    sample = cleanLogLines.sample(withReplacement=True, fraction=0.1)

-    sample.saveAsTextFile("out/sample_nasa_logs.csv")
+    sample.saveAsTextFile("out/sample_nasa_logs.csv")
+

‎rdd/persist/PersistExample.py

Lines changed: 7 additions & 2 deletions
@@ -1,9 +1,14 @@
-from pyspark import SparkContext, StorageLevel
+from pyspark import SparkContext, SparkConf, StorageLevel

if __name__ == "__main__":
-    sc = SparkContext("local", "persist")
+    conf = SparkConf().setAppName("persist").setMaster("local[*]")
+    sc = SparkContext(conf=conf)
+
    inputIntegers = [1, 2, 3, 4, 5]
    integerRdd = sc.parallelize(inputIntegers)
+
    integerRdd.persist(StorageLevel.MEMORY_ONLY)
+
    integerRdd.reduce(lambda x, y: x * y)
+
    integerRdd.count()

‎rdd/reduce/ReduceExample.py

Lines changed: 5 additions & 3 deletions
@@ -1,9 +1,11 @@
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-    sc = SparkContext("local", "reduce")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("reduce").setMaster("local[*]")
+    sc = SparkContext(conf=conf)
+
    inputIntegers = [1, 2, 3, 4, 5]
    integerRdd = sc.parallelize(inputIntegers)
+
    product = integerRdd.reduce(lambda x, y: x * y)
    print("product is :{}".format(product))

Lines changed: 9 additions & 6 deletions
@@ -1,12 +1,15 @@
-import sys
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-    sc = SparkContext("local", "primeNumbers")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("primeNumbers").setMaster("local[*]")
+    sc = SparkContext(conf=conf)
+
    lines = sc.textFile("in/prime_nums.text")
    numbers = lines.flatMap(lambda line: line.split("\t"))
+
    validNumbers = numbers.filter(lambda number: number)
+
    intNumbers = validNumbers.map(lambda number: int(number))
-    print("Sum is: ")
-    print(intNumbers.reduce(lambda x, y: x + y))
+
+    print("Sum is: {}".format(intNumbers.reduce(lambda x, y: x + y)))
+

‎rdd/take/TakeExample.py

Lines changed: 5 additions & 4 deletions
@@ -1,11 +1,12 @@
-import sys
-from pyspark import SparkContext
+from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
-    sc = SparkContext("local", "take")
-    sc.setLogLevel("ERROR")
+    conf = SparkConf().setAppName("take").setMaster("local[*]")
+    sc = SparkContext(conf=conf)
+
    inputWords = ["spark", "hadoop", "spark", "hive", "pig", "cassandra", "hadoop"]
    wordRdd = sc.parallelize(inputWords)
+
    words = wordRdd.take(3)
    for word in words:
        print(word)

0 commit comments

