*** Dataset from https://ecologicaldata.org/wiki/georges-bank-plankton ***
pyspark --packages com.databricks:spark-csv_2.10:1.4.0
gbp_df = sqlCtx.load( source = "com.databricks.spark.csv", header = 'true', inferSchema = 'true', path = 'file:///home/cloudera/HackForTheSeaDatasets/GeorgesBank-Plankton.csv')
gbp_df.printSchema()
root |-- FID: string (nullable = true) |-- id: integer (nullable = true) |-- resource_id: integer (nullable = true) |-- resource_citation: string (nullable = true) |-- resname: string (nullable = true) |-- lifestage: string (nullable = true) |-- basisofrecord: string (nullable = true) |-- latitude: double (nullable = true) |-- longitude: double (nullable = true) |-- coordinateprecision: string (nullable = true) |-- datelastcached: string (nullable = true) |-- datecollected: string (nullable = true) |-- dateprecision: string (nullable = true) |-- datelastmodified: string (nullable = true) |-- yearcollected: integer (nullable = true) |-- monthcollected: integer (nullable = true) |-- daycollected: integer (nullable = true) |-- valid_id: integer (nullable = true) |-- valid_aphia_id: integer (nullable = true) |-- sname: string (nullable = true) |-- sauthor: string (nullable = true) |-- tname: string (nullable = true) |-- tauthor: string (nullable = true) |-- storedpath: string (nullable = true) |-- geom: string (nullable = true) |-- cs6m: string (nullable = true) |-- eez_id: integer (nullable = true) |-- lme_id: integer (nullable = true) |-- meow_id: integer (nullable = true) |-- iho_id: integer (nullable = true) |-- fao_id: integer (nullable = true) |-- mwhs_id: string (nullable = true) |-- depth: double (nullable = true) |-- depthprecision: double (nullable = true) |-- bottomdepth: integer (nullable = true) |-- display: integer (nullable = true) |-- woa_depth: integer (nullable = true) |-- temperature: double (nullable = true) |-- salinity: double (nullable = true) |-- nitrate: double (nullable = true) |-- oxygen: double (nullable = true) |-- phosphate: double (nullable = true) |-- silicate: double (nullable = true) |-- recordlastcached: string (nullable = true) |-- sourceofrecord: string (nullable = true) |-- recordcitation: string (nullable = true) |-- recordurl: string (nullable = true) |-- institutioncode: string (nullable = true) |-- collectioncode: string (nullable = true) |-- catalognumber: string (nullable = true) |-- collector: string (nullable = true) |-- startyearcollected: string (nullable = true) |-- endyearcollected: string (nullable = true) |-- startmonthcollected: string (nullable = true) |-- endmonthcollected: string (nullable = true) |-- startdaycollected: string (nullable = true) |-- enddaycollected: string (nullable = true) |-- starttimecollected: string (nullable = true) |-- endtimecollected: string (nullable = true) |-- julianday: string (nullable = true) |-- startjulianday: string (nullable = true) |-- endjulianday: string (nullable = true) |-- timeofday: string (nullable = true) |-- starttimeofday: double (nullable = true) |-- endtimeofday: double (nullable = true) |-- timezone: string (nullable = true) |-- locality: string (nullable = true) |-- ocean: string (nullable = true) |-- country: string (nullable = true) |-- state: string (nullable = true) |-- county: string (nullable = true) |-- cscode: string (nullable = true) |-- slatitude: string (nullable = true) |-- elatitude: string (nullable = true) |-- slongitude: string (nullable = true) |-- elongitude: string (nullable = true) |-- seprecision: string (nullable = true) |-- minimumdepth: integer (nullable = true) |-- maximumdepth: integer (nullable = true) |-- depthrange: string (nullable = true) |-- identifiedby: string (nullable = true) |-- yearidentified: string (nullable = true) |-- monthidentified: string (nullable = true) |-- dayidentified: string (nullable = true) |-- typestatus: string (nullable = true) |-- collectornumber: string (nullable = true) |-- fieldnumber: integer (nullable = true) |-- sex: string (nullable = true) |-- preparationtype: string (nullable = true) |-- individualcount: string (nullable = true) // no data and it's a string! |-- observedindividualcount: string (nullable = true) |-- observedweight: string (nullable = true) |-- samplesize: string (nullable = true) |-- previouscatalognumber: string (nullable = true) |-- relationshiptype: string (nullable = true) |-- relatedcatalogitem: string (nullable = true) |-- notes: string (nullable = true) |-- concatenated: string (nullable = true) |-- scientificnameid: string (nullable = true)
gbp_df.count()
15305L
gbp_df.groupBy("yearcollected").count().show()
yearcollected count 1995 2582 1996 2419 1997 2767 1998 3836 1999 3701
gbp_df.groupBy("yearcollected").max("depth").show()
yearcollected MAX(depth#32)
1995 248.5
1996 347.0
1997 301.0
1998 351.5
1999 350.0
gbp_df.groupBy("yearcollected").min("depth").show()
yearcollected MIN(depth#32)
1995 6.5
1996 5.5
1997 0.5
1998 7.0
1999 7.0
gbp_df.groupBy("yearcollected").avg("depth").show()
yearcollected AVG(depth#32)
1995 48.074167312161116
1996 52.39561802397685
1997 52.907842428623056
1998 50.01629301355579
1999 51.749797352067006
gbp_df.groupBy("yearcollected").max("temperature").show()
yearcollected MAX(temperature#37)
1995 14.86800003
1996 14.86800003
1997 14.86800003
1998 14.86800003
1999 14.86800003
gbp_df.groupBy("yearcollected").min("temperature").show()
yearcollected MIN(temperature#37)
1995 8.75399971
1996 8.33600044
1997 7.14799976
1998 8.33600044
1999 8.33600044
gbp_df.groupBy("yearcollected").avg("temperature").show()
yearcollected AVG(temperature#37) 1995 11.727328003847381 1996 11.187256990699158 1997 11.435583872458974 1998 11.469532915705418 1999 11.22900756048172
gbp_df.groupBy("yearcollected").max("salinity").show()
yearcollected MAX(salinity#38)
1995 34.0929985
1996 34.0929985
1997 34.0929985
1998 34.0929985
1999 34.0929985
gbp_df.groupBy("yearcollected").min("salinity").show()
yearcollected MIN(salinity#38)
1995 32.29700089
1996 32.29700089
1997 32.29700089
1998 32.29700089
1999 32.29700089
gbp_df.groupBy("yearcollected").avg("salinity").show()
yearcollected AVG(salinity#38)
1995 33.203984438603925
1996 33.0961503668455
1997 33.14560471813368
1998 33.14066764916729
1999 33.06387653692713
gbp_df.groupBy("yearcollected").max("nitrate").show()
yearcollected MAX(nitrate#39)
1995 5.76800013
1996 6.25400019
1997 6.4460001
1998 6.25400019
1999 6.25400019
gbp_df.groupBy("yearcollected").min("nitrate").show()
yearcollected MIN(nitrate#39)
1995 2.00999999
1996 2.00999999
1997 2.00999999
1998 2.00999999
1999 2.00999999
gbp_df.groupBy("yearcollected").avg("nitrate").show()
yearcollected AVG(nitrate#39)
1995 2.6463798011526176
1996 2.9115007408130253
1997 2.798561096658278
1998 2.767145424312006
1999 2.7492793750535376
gbp_df.groupBy("yearcollected").max("oxygen").show()
yearcollected MAX(oxygen#40)
1995 6.85200024
1996 6.85200024
1997 6.85200024
1998 6.85200024
1999 6.85200024
gbp_df.groupBy("yearcollected").min("oxygen").show()
yearcollected MIN(oxygen#40)
1995 5.99399996
1996 5.99399996
1997 5.99399996
1998 5.99399996
1999 5.99399996
gbp_df.groupBy("yearcollected").avg("oxygen").show()
yearcollected AVG(oxygen#40)
1995 6.417204503051942
1996 6.4640080554796615
1997 6.442346750403555
1998 6.446340021253326
1999 6.483504245481837
gbp_df.groupBy("yearcollected").max("phosphate").show()
yearcollected MAX(phosphate#41)
1995 0.64499998
1996 0.69400001
1997 0.74199998
1998 0.69400001
1999 0.69400001
gbp_df.groupBy("yearcollected").min("phosphate").show()
yearcollected MIN(phosphate#41)
1995 0.333
1996 0.333
1997 0.333
1998 0.333
1999 0.333
gbp_df.groupBy("yearcollected").avg("phosphate").show()
yearcollected AVG(phosphate#41)
1995 0.4401639585876598
1996 0.4669837356422753
1997 0.45645649176544795
1998 0.45527870073619864
1999 0.45688757658458357
gbp_df.groupBy("yearcollected").max("silicate").show()
yearcollected MAX(silicate#42)
1995 4.12300014
1996 4.4380002
1997 4.59700012
1998 4.4380002
1999 4.4380002
gbp_df.groupBy("yearcollected").min("silicate").show()
yearcollected MIN(silicate#42)
1995 2.03299999
1996 2.03299999
1997 2.03299999
1998 2.03299999
1999 2.03299999
gbp_df.groupBy("yearcollected").avg("silicate").show()
yearcollected AVG(silicate#42)
1995 2.475397710633123
1996 2.6313642255121956
1997 2.571669603039107
1998 2.5586932499386843
1999 2.5577366064025964
*** Dataset from httP://pubs.usgs.gov/of/2003/og03-001/data/seddata/wigley61/ ***
gbbs_df = sqlCtx.load( source = "com.databricks.spark.csv", header = 'true', inferSchema = 'true', path = 'file:///home/cloudera/HackForTheSeaDatasets/GeorgesBank-BottomSediments.csv')
gbbs_df.printSchema()
root |-- PROJECT_ID: string (nullable = true) |-- VESSEL: string (nullable = true) |-- DATE: string (nullable = true) |-- STATION_ID: integer (nullable = true) |-- DEVICE: string (nullable = true) |-- LATITUDE: double (nullable = true) |-- LONGITUDE: double (nullable = true) |-- TEXTURE: string (nullable = true)
gbbs_df.count()
223L
gbp_df.registerTempTable("plankton") gbbs_df.registerTempTable("sediment")
combined_df = sqlCtx.sql("SELECT AVG(temperature), AVG(salinity), AVG(nitrate), AVG(oxygen), AVG(phosphate), AVG(silicate), sediment.TEXTURE FROM plankton INNER JOIN sediment ON plankton.latitude = sediment.LATITUDE AND plankton.longitude = sediment.LONGITUDE GROUP BY sediment.TEXTURE") combined_df.printSchema()
root |-- _c0: double (nullable = true) |-- _c1: double (nullable = true) |-- _c2: double (nullable = true) |-- _c3: double (nullable = true) |-- _c4: double (nullable = true) |-- _c5: double (nullable = true) |-- TEXTURE: string (nullable = true)
combined_df.show()
_c0 _c1 _c2 _c3 _c4 _c5 TEXTURE // no matches
Built With
- apache-spark
Log in or sign up for Devpost to join the conversation.