*** Dataset from https://ecologicaldata.org/wiki/georges-bank-plankton ***

pyspark --packages com.databricks:spark-csv_2.10:1.4.0

gbp_df = sqlCtx.load( source = "com.databricks.spark.csv", header = 'true', inferSchema = 'true', path = 'file:///home/cloudera/HackForTheSeaDatasets/GeorgesBank-Plankton.csv')

gbp_df.printSchema()

root |-- FID: string (nullable = true) |-- id: integer (nullable = true) |-- resource_id: integer (nullable = true) |-- resource_citation: string (nullable = true) |-- resname: string (nullable = true) |-- lifestage: string (nullable = true) |-- basisofrecord: string (nullable = true) |-- latitude: double (nullable = true) |-- longitude: double (nullable = true) |-- coordinateprecision: string (nullable = true) |-- datelastcached: string (nullable = true) |-- datecollected: string (nullable = true) |-- dateprecision: string (nullable = true) |-- datelastmodified: string (nullable = true) |-- yearcollected: integer (nullable = true) |-- monthcollected: integer (nullable = true) |-- daycollected: integer (nullable = true) |-- valid_id: integer (nullable = true) |-- valid_aphia_id: integer (nullable = true) |-- sname: string (nullable = true) |-- sauthor: string (nullable = true) |-- tname: string (nullable = true) |-- tauthor: string (nullable = true) |-- storedpath: string (nullable = true) |-- geom: string (nullable = true) |-- cs6m: string (nullable = true) |-- eez_id: integer (nullable = true) |-- lme_id: integer (nullable = true) |-- meow_id: integer (nullable = true) |-- iho_id: integer (nullable = true) |-- fao_id: integer (nullable = true) |-- mwhs_id: string (nullable = true) |-- depth: double (nullable = true) |-- depthprecision: double (nullable = true) |-- bottomdepth: integer (nullable = true) |-- display: integer (nullable = true) |-- woa_depth: integer (nullable = true) |-- temperature: double (nullable = true) |-- salinity: double (nullable = true) |-- nitrate: double (nullable = true) |-- oxygen: double (nullable = true) |-- phosphate: double (nullable = true) |-- silicate: double (nullable = true) |-- recordlastcached: string (nullable = true) |-- sourceofrecord: string (nullable = true) |-- recordcitation: string (nullable = true) |-- recordurl: string (nullable = true) |-- institutioncode: string (nullable = true) |-- collectioncode: string (nullable = true) |-- catalognumber: string (nullable = true) |-- collector: string (nullable = true) |-- startyearcollected: string (nullable = true) |-- endyearcollected: string (nullable = true) |-- startmonthcollected: string (nullable = true) |-- endmonthcollected: string (nullable = true) |-- startdaycollected: string (nullable = true) |-- enddaycollected: string (nullable = true) |-- starttimecollected: string (nullable = true) |-- endtimecollected: string (nullable = true) |-- julianday: string (nullable = true) |-- startjulianday: string (nullable = true) |-- endjulianday: string (nullable = true) |-- timeofday: string (nullable = true) |-- starttimeofday: double (nullable = true) |-- endtimeofday: double (nullable = true) |-- timezone: string (nullable = true) |-- locality: string (nullable = true) |-- ocean: string (nullable = true) |-- country: string (nullable = true) |-- state: string (nullable = true) |-- county: string (nullable = true) |-- cscode: string (nullable = true) |-- slatitude: string (nullable = true) |-- elatitude: string (nullable = true) |-- slongitude: string (nullable = true) |-- elongitude: string (nullable = true) |-- seprecision: string (nullable = true) |-- minimumdepth: integer (nullable = true) |-- maximumdepth: integer (nullable = true) |-- depthrange: string (nullable = true) |-- identifiedby: string (nullable = true) |-- yearidentified: string (nullable = true) |-- monthidentified: string (nullable = true) |-- dayidentified: string (nullable = true) |-- typestatus: string (nullable = true) |-- collectornumber: string (nullable = true) |-- fieldnumber: integer (nullable = true) |-- sex: string (nullable = true) |-- preparationtype: string (nullable = true) |-- individualcount: string (nullable = true) // no data and it's a string! |-- observedindividualcount: string (nullable = true) |-- observedweight: string (nullable = true) |-- samplesize: string (nullable = true) |-- previouscatalognumber: string (nullable = true) |-- relationshiptype: string (nullable = true) |-- relatedcatalogitem: string (nullable = true) |-- notes: string (nullable = true) |-- concatenated: string (nullable = true) |-- scientificnameid: string (nullable = true)

gbp_df.count()

15305L

gbp_df.groupBy("yearcollected").count().show()

yearcollected count 1995 2582 1996 2419 1997 2767 1998 3836 1999 3701

gbp_df.groupBy("yearcollected").max("depth").show()

yearcollected MAX(depth#32) 1995 248.5
1996 347.0
1997 301.0
1998 351.5
1999 350.0

gbp_df.groupBy("yearcollected").min("depth").show()

yearcollected MIN(depth#32) 1995 6.5
1996 5.5
1997 0.5
1998 7.0
1999 7.0

gbp_df.groupBy("yearcollected").avg("depth").show()

yearcollected AVG(depth#32)
1995 48.074167312161116 1996 52.39561802397685 1997 52.907842428623056 1998 50.01629301355579 1999 51.749797352067006

gbp_df.groupBy("yearcollected").max("temperature").show()

yearcollected MAX(temperature#37) 1995 14.86800003
1996 14.86800003
1997 14.86800003
1998 14.86800003
1999 14.86800003

gbp_df.groupBy("yearcollected").min("temperature").show()

yearcollected MIN(temperature#37) 1995 8.75399971
1996 8.33600044
1997 7.14799976
1998 8.33600044
1999 8.33600044

gbp_df.groupBy("yearcollected").avg("temperature").show()

yearcollected AVG(temperature#37) 1995 11.727328003847381 1996 11.187256990699158 1997 11.435583872458974 1998 11.469532915705418 1999 11.22900756048172

gbp_df.groupBy("yearcollected").max("salinity").show()

yearcollected MAX(salinity#38) 1995 34.0929985
1996 34.0929985
1997 34.0929985
1998 34.0929985
1999 34.0929985

gbp_df.groupBy("yearcollected").min("salinity").show()

yearcollected MIN(salinity#38) 1995 32.29700089
1996 32.29700089
1997 32.29700089
1998 32.29700089
1999 32.29700089

gbp_df.groupBy("yearcollected").avg("salinity").show()

yearcollected AVG(salinity#38)
1995 33.203984438603925 1996 33.0961503668455
1997 33.14560471813368 1998 33.14066764916729 1999 33.06387653692713

gbp_df.groupBy("yearcollected").max("nitrate").show()

yearcollected MAX(nitrate#39) 1995 5.76800013
1996 6.25400019
1997 6.4460001
1998 6.25400019
1999 6.25400019

gbp_df.groupBy("yearcollected").min("nitrate").show()

yearcollected MIN(nitrate#39) 1995 2.00999999
1996 2.00999999
1997 2.00999999
1998 2.00999999
1999 2.00999999

gbp_df.groupBy("yearcollected").avg("nitrate").show()

yearcollected AVG(nitrate#39)
1995 2.6463798011526176 1996 2.9115007408130253 1997 2.798561096658278 1998 2.767145424312006 1999 2.7492793750535376

gbp_df.groupBy("yearcollected").max("oxygen").show()

yearcollected MAX(oxygen#40) 1995 6.85200024
1996 6.85200024
1997 6.85200024
1998 6.85200024
1999 6.85200024

gbp_df.groupBy("yearcollected").min("oxygen").show()

yearcollected MIN(oxygen#40) 1995 5.99399996
1996 5.99399996
1997 5.99399996
1998 5.99399996
1999 5.99399996

gbp_df.groupBy("yearcollected").avg("oxygen").show()

yearcollected AVG(oxygen#40)
1995 6.417204503051942 1996 6.4640080554796615 1997 6.442346750403555 1998 6.446340021253326 1999 6.483504245481837

gbp_df.groupBy("yearcollected").max("phosphate").show()

yearcollected MAX(phosphate#41) 1995 0.64499998
1996 0.69400001
1997 0.74199998
1998 0.69400001
1999 0.69400001

gbp_df.groupBy("yearcollected").min("phosphate").show()

yearcollected MIN(phosphate#41) 1995 0.333
1996 0.333
1997 0.333
1998 0.333
1999 0.333

gbp_df.groupBy("yearcollected").avg("phosphate").show()

yearcollected AVG(phosphate#41)
1995 0.4401639585876598 1996 0.4669837356422753 1997 0.45645649176544795 1998 0.45527870073619864 1999 0.45688757658458357

gbp_df.groupBy("yearcollected").max("silicate").show()

yearcollected MAX(silicate#42) 1995 4.12300014
1996 4.4380002
1997 4.59700012
1998 4.4380002
1999 4.4380002

gbp_df.groupBy("yearcollected").min("silicate").show()

yearcollected MIN(silicate#42) 1995 2.03299999
1996 2.03299999
1997 2.03299999
1998 2.03299999
1999 2.03299999

gbp_df.groupBy("yearcollected").avg("silicate").show()

yearcollected AVG(silicate#42)
1995 2.475397710633123 1996 2.6313642255121956 1997 2.571669603039107 1998 2.5586932499386843 1999 2.5577366064025964

*** Dataset from httP://pubs.usgs.gov/of/2003/og03-001/data/seddata/wigley61/ ***

gbbs_df = sqlCtx.load( source = "com.databricks.spark.csv", header = 'true', inferSchema = 'true', path = 'file:///home/cloudera/HackForTheSeaDatasets/GeorgesBank-BottomSediments.csv')

gbbs_df.printSchema()

root |-- PROJECT_ID: string (nullable = true) |-- VESSEL: string (nullable = true) |-- DATE: string (nullable = true) |-- STATION_ID: integer (nullable = true) |-- DEVICE: string (nullable = true) |-- LATITUDE: double (nullable = true) |-- LONGITUDE: double (nullable = true) |-- TEXTURE: string (nullable = true)

gbbs_df.count()

223L

gbp_df.registerTempTable("plankton") gbbs_df.registerTempTable("sediment")

combined_df = sqlCtx.sql("SELECT AVG(temperature), AVG(salinity), AVG(nitrate), AVG(oxygen), AVG(phosphate), AVG(silicate), sediment.TEXTURE FROM plankton INNER JOIN sediment ON plankton.latitude = sediment.LATITUDE AND plankton.longitude = sediment.LONGITUDE GROUP BY sediment.TEXTURE") combined_df.printSchema()

root |-- _c0: double (nullable = true) |-- _c1: double (nullable = true) |-- _c2: double (nullable = true) |-- _c3: double (nullable = true) |-- _c4: double (nullable = true) |-- _c5: double (nullable = true) |-- TEXTURE: string (nullable = true)

combined_df.show()

_c0 _c1 _c2 _c3 _c4 _c5 TEXTURE // no matches

Built With

  • apache-spark
Share this project:

Updates