Scala to Python - pairRdd folder #4
@@ -0,0 +1,21 @@
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext("local", "AverageHousePrice")
    sc.setLogLevel("ERROR")

    lines = sc.textFile("in/RealEstate.csv")
    # Drop the header row, which contains the column name "Bedrooms".
    cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)

    # Key on the bedrooms column, with the selling price as the value.
    housePricePairRdd = cleanedLines.map(lambda line: (line.split(",")[3], float(line.split(",")[2])))

    # The accumulator is a (count, total) tuple per key.
    createCombiner = lambda x: (1, x)
    mergeValue = lambda avgCount, x: (avgCount[0] + 1, avgCount[1] + x)
    mergeCombiners = lambda avgCountA, avgCountB: (avgCountA[0] + avgCountB[0], avgCountA[1] + avgCountB[1])

    housePriceTotal = housePricePairRdd.combineByKey(createCombiner, mergeValue, mergeCombiners)
    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])

    for bedrooms, avgPrice in housePriceAvg.collect():
        print("{} : {}".format(bedrooms, avgPrice))
This file was deleted.
@@ -0,0 +1,14 @@
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext("local", "wordCounts")
    sc.setLogLevel("ERROR")

    lines = sc.textFile("in/word_count.text")
    wordRdd = lines.flatMap(lambda line: line.split(" "))
    wordPairRdd = wordRdd.map(lambda word: (word, 1))

    wordCounts = wordPairRdd.reduceByKey(lambda x, y: x + y)
    for word, count in wordCounts.collect():
        print("{} : {}".format(word, count))
This file was deleted.
@@ -0,0 +1,24 @@
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext("local", "avgHousePrice")
    sc.setLogLevel("ERROR")

    lines = sc.textFile("in/RealEstate.csv")
    cleanedLines = lines.filter(lambda line: "Bedrooms" not in line)

    housePricePairRdd = cleanedLines.map(lambda line:
        (line.split(",")[3], (1, float(line.split(",")[2]))))

    housePriceTotal = housePricePairRdd \
        .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))

    print("housePriceTotal: ")
    for bedroom, total in housePriceTotal.collect():
        print("{} : {}".format(bedroom, total))

    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount[1] / avgCount[0])

    print("\nhousePriceAvg: ")
    for bedroom, avg in housePriceAvg.collect():
        print("{} : {}".format(bedroom, avg))
This file was deleted.
@@ -0,0 +1,7 @@
class AvgCount():
    def __init__(self, count: int, total: float):
        self.count = count
        self.total = total
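Nothing in this diff actually uses AvgCount; the reduceByKey version above sticks to bare (count, total) tuples. If the class were wired in, the aggregation might look like this hypothetical sketch (names reused from the file above):

    # Hypothetical sketch: AvgCount objects in place of (count, total) tuples.
    housePricePairRdd = cleanedLines.map(lambda line:
        (line.split(",")[3], AvgCount(1, float(line.split(",")[2]))))
    housePriceTotal = housePricePairRdd \
        .reduceByKey(lambda x, y: AvgCount(x.count + y.count, x.total + y.total))
    housePriceAvg = housePriceTotal.mapValues(lambda avgCount: avgCount.total / avgCount.count)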
This file was deleted.
@@ -0,0 +1,12 @@
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext("local", "create")
    sc.setLogLevel("ERROR")

    inputStrings = ["Lily 23", "Jack 29", "Mary 29", "James 8"]
    regularRDDs = sc.parallelize(inputStrings)

    pairRDD = regularRDDs.map(lambda s: (s.split(" ")[0], s.split(" ")[1]))
    pairRDD.coalesce(1).saveAsTextFile("out/pair_rdd_from_regular_rdd")
This file was deleted.
@@ -0,0 +1,11 @@
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext("local", "create")
    sc.setLogLevel("ERROR")

    tuples = [("Lily", 23), ("Jack", 29), ("Mary", 29), ("James", 8)]
    pairRDD = sc.parallelize(tuples)
    pairRDD.coalesce(1).saveAsTextFile("out/pair_rdd_from_tuple_list")
This file was deleted.
@@ -0,0 +1,20 @@
from pyspark import SparkContext

if __name__ == "__main__":
    '''
    Create a Spark program to read the airport data from in/airports.text;
    generate a pair RDD with the airport name being the key and the country name being the value.
    Then remove all the airports which are located in the United States and output the pair RDD to
    out/airports_not_in_usa_pair_rdd.text.

    Each row of the input file contains the following columns:
    Airport ID, Name of airport, Main city served by airport, Country where airport is located,
    IATA/FAA code, ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:
    ("Kamloops", "Canada")
    ("Wewak Intl", "Papua New Guinea")
    ...
    '''
This file was deleted.
@@ -0,0 +1,16 @@
from pyspark import SparkContext
from commons.Utils import Utils

if __name__ == "__main__":
    sc = SparkContext("local", "airports")
    sc.setLogLevel("ERROR")

    airportsRDD = sc.textFile("in/airports.text")

    airportPairRDD = airportsRDD.map(lambda line:
        (Utils.COMMA_DELIMITER.split(line)[1],
         Utils.COMMA_DELIMITER.split(line)[3]))
    airportsNotInUSA = airportPairRDD.filter(lambda keyValue: keyValue[1] != "\"United States\"")

    airportsNotInUSA.saveAsTextFile("out/airports_not_in_usa_pair_rdd.text")
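commons.Utils is not included in this diff. Presumably COMMA_DELIMITER is a precompiled regex that splits on commas only outside double quotes, so quoted fields such as "Washington, DC" are not broken apart. A minimal sketch of such a helper, as an assumption rather than the file's actual contents:

    import re

    class Utils():
        # Assumed helper: split on a comma only when an even number of
        # double quotes remains ahead of it (i.e. the comma is unquoted).
        COMMA_DELIMITER = re.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''')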
This file was deleted.
@@ -0,0 +1,23 @@
from pyspark import SparkContext

if __name__ == "__main__":
    '''
    Create a Spark program to read the airport data from in/airports.text and
    output the list of the names of the airports located in each country.

    Each row of the input file contains the following columns:
    Airport ID, Name of airport, Main city served by airport, Country where airport is located, IATA/FAA code,
    ICAO Code, Latitude, Longitude, Altitude, Timezone, DST, Timezone in Olson format

    Sample output:
    "Canada", ["Bagotville", "Montreal", "Coronation", ...]
    "Norway", ["Vigra", "Andenes", "Alta", "Bomoen", "Bronnoy", ...]
    "Papua New Guinea", ["Goroka", "Madang", ...]
    ...
    '''
This file was deleted.
@@ -0,0 +1,18 @@
from pyspark import SparkContext
from commons.Utils import Utils

if __name__ == "__main__":
    sc = SparkContext("local", "airports")
    sc.setLogLevel("ERROR")

    lines = sc.textFile("in/airports.text")

    countryAndAirportNameAndPair = lines.map(lambda airport:
        (Utils.COMMA_DELIMITER.split(airport)[3],
         Utils.COMMA_DELIMITER.split(airport)[1]))

    airportsByCountry = countryAndAirportNameAndPair.groupByKey()

    for country, airportName in airportsByCountry.collectAsMap().items():
        print("{}: {}".format(country, list(airportName)))