- Notifications
You must be signed in to change notification settings - Fork 302
Scala to Python - advanced folder#3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Uh oh!
There was an error while loading. Please reload this page.
Changes from all commits
File filter
Filter by extension
Conversations
Uh oh!
There was an error while loading.Please reload this page.
Jump to
Uh oh!
There was an error while loading.Please reload this page.
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from pyspark import SparkContext | ||
from commons.Utils import Utils | ||
def filterResponseFromCanada(response, total, missingSalaryMidPoint):
    """Return True when the survey row in *response* is from Canada.

    Side effects on the Spark accumulators: *total* is incremented for
    every row seen, and *missingSalaryMidPoint* is incremented when the
    salary-mid-point field (column 14) is empty.
    """
    fields = Utils.COMMA_DELIMITER.split(response)
    total.add(1)
    if not fields[14]:
        missingSalaryMidPoint.add(1)
    isCanadian = fields[2] == "Canada"
    return isCanadian
if __name__ == "__main__":
    # Count Stack Overflow 2016 survey responses from Canada, using Spark
    # accumulators to also track the total row count and the number of rows
    # missing a salary mid point.
    sc = SparkContext("local", "StackOverFlowSurvey")
    sc.setLogLevel("ERROR")
    total = sc.accumulator(0)
    missingSalaryMidPoint = sc.accumulator(0)
    responseRDD = sc.textFile("in/2016-stack-overflow-survey-responses.csv")
    responseFromCanada = responseRDD.filter(lambda response: \
        filterResponseFromCanada(response, total, missingSalaryMidPoint))
    # count() is the action that actually runs the job; accumulator values
    # are only meaningful after it completes.
    print("Count of responses from Canada: {}".format(responseFromCanada.count()))
    print("Total count of responses: {}".format(total.value))
    print("Count of responses missing salary middle point: {}".format(missingSalaryMidPoint.value))
    # Fix: release Spark resources — the original never stopped the context.
    sc.stop()
This file was deleted.
Uh oh!
There was an error while loading.Please reload this page.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from pyspark import SparkContext | ||
from commons.Utils import Utils | ||
def filterResponseFromCanada(response, total, missingSalaryMidPoint, processedBytes):
    """Return True when the survey row in *response* is from Canada.

    Side effects on the Spark accumulators: *processedBytes* grows by the
    UTF-8 byte length of the row, *total* is incremented for every row,
    and *missingSalaryMidPoint* when the salary-mid-point field
    (column 14) is empty.
    """
    processedBytes.add(len(response.encode('utf-8')))
    fields = Utils.COMMA_DELIMITER.split(response)
    total.add(1)
    if not fields[14]:
        missingSalaryMidPoint.add(1)
    isCanadian = fields[2] == "Canada"
    return isCanadian
if __name__ == "__main__":
    # Same survey analysis as the basic version, plus a processedBytes
    # accumulator that tracks how many bytes of input were scanned.
    sc = SparkContext("local", "StackOverFlowSurvey")
    sc.setLogLevel("ERROR")
    total = sc.accumulator(0)
    missingSalaryMidPoint = sc.accumulator(0)
    processedBytes = sc.accumulator(0)
    responseRDD = sc.textFile("in/2016-stack-overflow-survey-responses.csv")
    responseFromCanada = responseRDD.filter(lambda response: \
        filterResponseFromCanada(response, total, missingSalaryMidPoint, processedBytes))
    # count() triggers the job; accumulators are only reliable afterwards.
    print("Count of responses from Canada: {}".format(responseFromCanada.count()))
    print("Number of bytes processed: {}".format(processedBytes.value))
    print("Total count of responses: {}".format(total.value))
    print("Count of responses missing salary middle point: {}".format(missingSalaryMidPoint.value))
    # Fix: release Spark resources — the original never stopped the context.
    sc.stop()
This file was deleted.
Uh oh!
There was an error while loading.Please reload this page.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
from pyspark import SparkContext | ||
from commons.Utils import Utils | ||
def getPostPrefix(line: str):
    """Return the outward part of the postcode in column 4 of *line*
    (the text before the first space), or None when the field is empty."""
    postcode = Utils.COMMA_DELIMITER.split(line)[4]
    if not postcode:
        return None
    return postcode.split(" ")[0]
def loadPostCodeMap():
    """Load in/uk-postcode.csv into a dict mapping postcode prefix
    (column 0) to region name (column 7).

    Later duplicate prefixes overwrite earlier ones.
    """
    # Fix: use a context manager so the file handle is closed; the
    # original open(...).read() leaked the handle.
    with open("in/uk-postcode.csv", "r") as f:
        lines = f.read().split("\n")
    splitsForLines = [Utils.COMMA_DELIMITER.split(line) for line in lines if line != ""]
    return {splits[0]: splits[7] for splits in splitsForLines}
if __name__ == "__main__":
    # Tag each UK makerspace row with its region by looking up the
    # postcode prefix in a broadcast map, then print region counts.
    sc = SparkContext("local", "UkMakerSpaces")
    sc.setLogLevel("ERROR")
    # Broadcast the prefix -> region map once instead of shipping it with
    # every task.
    postCodeMap = sc.broadcast(loadPostCodeMap())
    makerSpaceRdd = sc.textFile("in/uk-makerspaces-identifiable-data.csv")
    regions = makerSpaceRdd \
        .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[0] != "Timestamp") \
        .filter(lambda line: getPostPrefix(line) is not None) \
        .map(lambda line: postCodeMap.value.get(getPostPrefix(line), "Unknown"))
    # Fixes vs original: dict.get replaces the membership-test-plus-index
    # double lookup (and one fewer getPostPrefix call per row), and the
    # output label typo "Unknow" is corrected to "Unknown".
    for region, count in regions.countByValue().items():
        print("{} : {}".format(region, count))
    # Fix: release Spark resources — the original never stopped the context.
    sc.stop()
This file was deleted.
Uh oh!
There was an error while loading.Please reload this page.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from pyspark import SparkContext | ||
from commons.Utils import Utils | ||
def getPostPrefixes(line: str):
    """Return every leading prefix of the whitespace-stripped postcode in
    column 4 of *line*, from the empty string up to the full postcode.

    Bug fix: the original called postcode.replace("\\s+", ""), but
    str.replace matches a literal substring, not a regex, so whitespace
    was never removed. "".join(postcode.split()) strips all whitespace
    as clearly intended.
    """
    postcode = Utils.COMMA_DELIMITER.split(line)[4]
    cleanedPostCode = "".join(postcode.split())
    return [cleanedPostCode[0:i] for i in range(0, len(cleanedPostCode) + 1)]
def loadPostCodeMap():
    """Load in/uk-postcode.csv into a dict mapping postcode prefix
    (column 0) to region name (column 7).

    Later duplicate prefixes overwrite earlier ones.
    """
    # Fix: use a context manager so the file handle is closed; the
    # original open(...).read() leaked the handle.
    with open("in/uk-postcode.csv", "r") as f:
        lines = f.read().split("\n")
    splitsForLines = [Utils.COMMA_DELIMITER.split(line) for line in lines if line != ""]
    return {splits[0]: splits[7] for splits in splitsForLines}
if __name__ == "__main__":
    # Variant without broadcast: the plain dict is captured by the closure
    # below and shipped to executors with each task.
    sc = SparkContext("local", "UkMakerSpaces")
    sc.setLogLevel("ERROR")
    postCodeMap = loadPostCodeMap()
    makerSpaceRdd = sc.textFile("in/uk-makerspaces-identifiable-data.csv")
    # For each row, take the region of the first postcode prefix found in
    # the map; fall back to "Unknown" (fixes the original "Unknow" typo).
    regions = makerSpaceRdd \
        .filter(lambda line: Utils.COMMA_DELIMITER.split(line)[0] != "Timestamp") \
        .map(lambda line: next((postCodeMap[prefix] for prefix in getPostPrefixes(line) \
            if prefix in postCodeMap), "Unknown"))
    for region, count in regions.countByValue().items():
        print("{} : {}".format(region, count))
    # Fix: release Spark resources — the original never stopped the context.
    sc.stop()
This file was deleted.
Uh oh!
There was an error while loading.Please reload this page.