1 person likes it.

Course 3: Exploring Titanic dataset

F# introduction course - Getting data about Titanic passengers using the CSV type provider and analyzing it using standard sequence-processing functions known from LINQ. To be used in Try F#.

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
43: 
44: 
45: 
46: 
47: 
48: 
49: 
50: 
51: 
52: 
53: 
54: 
55: 
56: 
57: 
58: 
59: 
60: 
61: 
62: 
63: 
64: 
65: 
66: 
67: 
68: 
69: 
70: 
71: 
72: 
73: 
74: 
75: 
76: 
77: 
78: 
79: 
80: 
81: 
82: 
83: 
84: 
85: 
86: 
87: 
88: 
89: 
90: 
91: 
92: 
93: 
94: 
95: 
// Load type provider for CSV files
#r "Samples.Csv.dll"
open Samples.Csv

// ------------------------------------------------------------------
// TUTORIAL: Parsing and exploring the Titanic CSV data set 
// ------------------------------------------------------------------

// Download data from the web, use CSV provider to infer column names.
// DataUrl must be a literal because it is passed to the type provider
// as a static argument; Schema lists one type per column
// (three ints followed by nine strings).
[<Literal>]
let DataUrl =
  "https://gist.github.com/tpetricek/263fb1bee9366170b2ef/raw/90d012bac3713e8618d3ae2f83f2f6535b6bebd9/titanic.csv"

type Titanic =
  CsvFile<DataUrl, Schema="int,int,int,string,string,string,string,string,string,string,string,string">

// Load & explore the data from the web URL
let data = new Titanic()
// Take the first row so that individual columns can be inspected
let first = Seq.head data.Data

// Evaluate these one at a time to look at single columns of that row
first.Name
first.Age

// Print names of surviving children: keep rows that survived, have a
// non-empty age and are younger than 18, then print name and age.
// (Note - the value of age may be missing, or silly)
data.Data
|> Seq.filter (fun p -> p.Survived = 1 && p.Age <> "" && (float p.Age) < 18.0)
|> Seq.iter (fun p -> printfn "%s (%s)" p.Name p.Age)

// TASK #1: Skip suspicious floating point values 
// (You can use Contains member method to test for "."
// or you can look for values less than 1)

// TASK #2: Print names of surviving males 
// who have name longer than 40 characters


// ------------------------------------------------------------------
// TUTORIAL: Introducing higher-order, first-class functions & collections 
// ------------------------------------------------------------------

// Helper functions that extract information from a row 
/// True when the Survived column of the row equals 1.
let survived (row:Titanic.Row) =
  match row.Survived with
  | 1 -> true
  | _ -> false
  
/// Returns the Name column of the row.
let name (row:Titanic.Row) : string = row.Name

/// True when the Age text of the row is non-empty and contains no "."
/// (rows with an empty or fractional age are rejected).
let hasAge (row:Titanic.Row) =
  not (row.Age = "" || row.Age.Contains("."))
     
/// Converts the Age text of the row to a float.
/// Used below only on rows that already passed hasAge.
let age (row:Titanic.Row) =
  row.Age |> float

// Try the helpers on the first row (evaluate each line separately)
first |> name
first |> hasAge
first |> age

// Seq.* functions can be used to implement LINQ-like queries.
// For example, get a sequence of names by applying the 'name'
// helper to every row:
Seq.map name data.Data

// Get count of passengers & average age on Titanic.
// The average is written as nested calls, read inside-out: keep rows
// accepted by 'hasAge', convert each to a float with 'age', then average.
Seq.length data.Data
Seq.average (Seq.map age (Seq.filter hasAge data.Data))

// Nicer notation using the pipelining operator: the steps now read
// top to bottom - keep rows with a usable age, then average their ages
data.Data
|> Seq.filter hasAge
|> Seq.averageBy age

// Or we can use lambda functions, which makes things easier:
// the same filter and conversion are written inline instead of
// going through the named helpers
data.Data
|> Seq.filter (fun p -> not (p.Age = "" || p.Age.Contains(".")))
|> Seq.averageBy (fun p -> float p.Age)

// TASK #3: Find out whether the average age of those who survived
// is greater/smaller than the average age of those who died

// ------------------------------------------------------------------
// TUTORIAL: More things to try on your own!
// ------------------------------------------------------------------

// Calculate the percentage of survivors by different embarkation point.
// Inside the lambda, 'data' is the group of rows for one embarkation
// point (it shadows the top-level 'data' value).
data.Data
|> Seq.groupBy (fun passenger -> passenger.Embarked)
|> Seq.map (fun (port, data) ->
     let total = Seq.length data
     let survivors =
       data
       |> Seq.filter (fun p -> p.Survived = 1)
       |> Seq.length
     port, float survivors / float total * 100.0)

// TASK  #4: Calculate average age by different embarkation point
// (Use Seq.groupBy as above and then use Seq.averageBy on the 
// group 'data' as above to get average age)     
Multiple items
type LiteralAttribute =
  inherit Attribute
  new : unit -> LiteralAttribute

Full name: Microsoft.FSharp.Core.LiteralAttribute

--------------------
new : unit -> LiteralAttribute
val DataUrl : string

Full name: Script.DataUrl
type Titanic = obj

Full name: Script.Titanic
val data : Titanic

Full name: Script.data
val first : obj

Full name: Script.first
namespace Microsoft.FSharp.Data
module Seq

from Microsoft.FSharp.Collections
val head : source:seq<'T> -> 'T

Full name: Microsoft.FSharp.Collections.Seq.head
val row : obj
Multiple items
val float : value:'T -> float (requires member op_Explicit)

Full name: Microsoft.FSharp.Core.Operators.float

--------------------
type float = System.Double

Full name: Microsoft.FSharp.Core.float

--------------------
type float<'Measure> = float

Full name: Microsoft.FSharp.Core.float<_>
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
val survived : row:'a -> bool

Full name: Script.survived
val row : 'a
val name : row:'a -> 'b

Full name: Script.name
val hasAge : row:'a -> bool

Full name: Script.hasAge
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
val age : row:'a -> float

Full name: Script.age
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>

Full name: Microsoft.FSharp.Collections.Seq.map
val length : source:seq<'T> -> int

Full name: Microsoft.FSharp.Collections.Seq.length
val average : source:seq<'T> -> 'T (requires member ( + ) and member DivideByInt and member get_Zero)

Full name: Microsoft.FSharp.Collections.Seq.average
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.filter
val r : obj
val averageBy : projection:('T -> 'U) -> source:seq<'T> -> 'U (requires member ( + ) and member DivideByInt and member get_Zero)

Full name: Microsoft.FSharp.Collections.Seq.averageBy
val groupBy : projection:('T -> 'Key) -> source:seq<'T> -> seq<'Key * seq<'T>> (requires equality)

Full name: Microsoft.FSharp.Collections.Seq.groupBy
val embarked : obj
val data : seq<obj>
val survivors : int
val total : int
Raw view Test code New version

More information

Link:http://fssnip.net/jb
Posted:11 years ago
Author:Tomas Petricek
Tags: try f# , collections , sequences , csv , higher-order functions