2 people like it.

Extract script blocks from an HTML page (FP)

A slightly more functional-style version of this snippet: http://fssnip.net/iR

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
open HtmlAgilityPack
open System.Text.RegularExpressions
open System.IO
open System
open System.Text.RegularExpressions
open FSharpx
open FSharpx.Choice

module JsRetriever = 
    /// Reduces the outer HTML of a `<script>` element to the bare value of its
    /// `src` attribute by deleting the surrounding markup piece by piece.
    ///
    /// The patterns below are applied in order; after every replacement the
    /// intermediate result is trimmed. Matching ignores case, because HTML tag
    /// and attribute names are case-insensitive (`<SCRIPT SRC="...">` is as
    /// valid as `<script src="...">`).
    ///
    /// text: markup of a single script element, e.g.
    ///       `<script type="text/javascript" src="js/app.js"></script>`.
    /// Returns the text that is left once all patterns have been removed
    /// (`js/app.js` for the example above).
    let stripHtml text = 
        ["<script\s*"
         "\"?\s*type\s*=\s*\"\s*text/javascript\s*\"\s*"
         "</script>"
         "src\s*=\s*"
         "\""
         ">"
         "</"
         "<"]
        // Order matters: the specific tag/attribute patterns must run before the
        // generic quote and angle-bracket patterns consume their delimiters.
        |> List.fold (fun res pattern -> Regex.Replace(res, pattern, "", RegexOptions.IgnoreCase).Trim()) text
    
    /// Resolves `path` against the directory that contains `parent` and returns
    /// the normalized full path.
    ///
    /// parent: path of the referencing file; only its directory part is used.
    /// path:   path to resolve, typically relative to that directory.
    let convertToAbsolute parent path = 
        let combined = Path.Combine (Path.GetDirectoryName (parent), path)
        // GetFullPath also collapses segments such as ".." in the combined path.
        Path.GetFullPath combined
    
    /// Tests whether the extension of `file` equals `ext`, ignoring case.
    ///
    /// ext:  extension to look for, including the leading dot (e.g. ".js").
    /// file: file name or path whose extension is taken with `Path.GetExtension`.
    /// Returns true when both extensions are equal under an ordinal,
    /// case-insensitive comparison, so "APP.JS" is accepted for ".js".
    let endsOn (ext: string) (file: string) =
        // The static String.Equals tolerates a null on either side.
        String.Equals (Path.GetExtension file, ext, StringComparison.OrdinalIgnoreCase)

    /// Lists the `.js` files referenced from the `<head>` of an HTML/ASPX page.
    ///
    /// The document at `defaultAspxPath` is loaded with HtmlAgilityPack, the
    /// nodes selected by the XPath "/html/head/script/@src" are reduced to their
    /// `src` value with `stripHtml`, and each value is resolved against the
    /// directory of `defaultAspxPath` with `convertToAbsolute`.
    ///
    /// Entries whose processing throws are not returned; their exceptions are
    /// written to standard output as one comma-separated line. Nothing is
    /// printed when every entry succeeds.
    ///
    /// defaultAspxPath: path of the page to inspect.
    /// Returns the resolved paths that end in ".js", in document order.
    let getJsFiles (defaultAspxPath: string) = 
        let doc = HtmlDocument()
        doc.Load defaultAspxPath
        // SelectNodes yields null instead of an empty collection when the XPath
        // matches nothing, which would make Seq.map throw.
        let scriptNodes =
            match doc.DocumentNode.SelectNodes "/html/head/script/@src" with
            | null -> Seq.empty
            | nodes -> nodes :> seq<HtmlNode>
        let files, errors =
            scriptNodes
            |> Seq.map (fun x -> x.OuterHtml)
            |> Seq.map (Choice.protect stripHtml >=> Choice.protect (convertToAbsolute defaultAspxPath))
            |> Seq.fold (fun (files, es) -> 
                Choice.choice
                    (fun f -> f :: files, es)
                    (fun e -> files, e :: es)) ([], [])
        // The fold prepends, so both lists are reversed below to restore the
        // order in which the scripts appear in the document.
        if not (List.isEmpty errors) then
            errors
            |> List.rev
            |> List.map (sprintf "%O")
            |> String.concat ", "
            |> printfn "%s"
        files |> List.rev |> Seq.filter (endsOn ".js")
namespace HtmlAgilityPack
namespace System
namespace System.Text
namespace System.Text.RegularExpressions
namespace System.IO
namespace FSharpx
module Choice

from FSharpx
module JsRetriever

from Script
val stripHtml : text:string -> string

Full name: Script.JsRetriever.stripHtml
val text : string
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val fold : folder:('State -> 'T -> 'State) -> state:'State -> list:'T list -> 'State

Full name: Microsoft.FSharp.Collections.List.fold
val res : string
val pattern : string
Multiple items
type Regex =
  new : pattern:string -> Regex + 1 overload
  member GetGroupNames : unit -> string[]
  member GetGroupNumbers : unit -> int[]
  member GroupNameFromNumber : i:int -> string
  member GroupNumberFromName : name:string -> int
  member IsMatch : input:string -> bool + 1 overload
  member Match : input:string -> Match + 2 overloads
  member Matches : input:string -> MatchCollection + 1 overload
  member Options : RegexOptions
  member Replace : input:string * replacement:string -> string + 5 overloads
  ...

Full name: System.Text.RegularExpressions.Regex

--------------------
Regex(pattern: string) : unit
Regex(pattern: string, options: RegexOptions) : unit
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator) : string
Regex.Replace(input: string, pattern: string, replacement: string) : string
Regex.Replace(input: string, pattern: string, evaluator: MatchEvaluator, options: RegexOptions) : string
Regex.Replace(input: string, pattern: string, replacement: string, options: RegexOptions) : string
val convertToAbsolute : parent:string -> path:string -> string

Full name: Script.JsRetriever.convertToAbsolute
val parent : string
val path : string
type Path =
  static val DirectorySeparatorChar : char
  static val AltDirectorySeparatorChar : char
  static val VolumeSeparatorChar : char
  static val InvalidPathChars : char[]
  static val PathSeparator : char
  static member ChangeExtension : path:string * extension:string -> string
  static member Combine : [<ParamArray>] paths:string[] -> string + 3 overloads
  static member GetDirectoryName : path:string -> string
  static member GetExtension : path:string -> string
  static member GetFileName : path:string -> string
  ...

Full name: System.IO.Path
Path.Combine([<ParamArray>] paths: string []) : string
Path.Combine(path1: string, path2: string) : string
Path.Combine(path1: string, path2: string, path3: string) : string
Path.Combine(path1: string, path2: string, path3: string, path4: string) : string
Path.GetDirectoryName(path: string) : string
Path.GetFullPath(path: string) : string
val endsOn : ext:string -> file:string -> bool

Full name: Script.JsRetriever.endsOn
val ext : string
val file : string
Path.GetExtension(path: string) : string
val getJsFiles : defaultAspxPath:string -> seq<string>

Full name: Script.JsRetriever.getJsFiles
val defaultAspxPath : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
val doc : HtmlDocument
Multiple items
type HtmlDocument =
  new : unit -> HtmlDocument
  val OptionAddDebuggingAttributes : bool
  val OptionAutoCloseOnEnd : bool
  val OptionCheckSyntax : bool
  val OptionComputeChecksum : bool
  val OptionDefaultStreamEncoding : Encoding
  val OptionExtractErrorSourceText : bool
  val OptionExtractErrorSourceTextMaxLength : int
  val OptionFixNestedTags : bool
  val OptionOutputAsXml : bool
  ...

Full name: HtmlAgilityPack.HtmlDocument

--------------------
HtmlDocument() : unit
HtmlDocument.Load(reader: TextReader) : unit
   (+0 other overloads)
HtmlDocument.Load(stream: Stream) : unit
   (+0 other overloads)
HtmlDocument.Load(path: string) : unit
   (+0 other overloads)
HtmlDocument.Load(stream: Stream, encoding: Text.Encoding) : unit
   (+0 other overloads)
HtmlDocument.Load(stream: Stream, detectEncodingFromByteOrderMarks: bool) : unit
   (+0 other overloads)
HtmlDocument.Load(path: string, encoding: Text.Encoding) : unit
   (+0 other overloads)
HtmlDocument.Load(path: string, detectEncodingFromByteOrderMarks: bool) : unit
   (+0 other overloads)
HtmlDocument.Load(stream: Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
   (+0 other overloads)
HtmlDocument.Load(path: string, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
   (+0 other overloads)
HtmlDocument.Load(stream: Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool, buffersize: int) : unit
   (+0 other overloads)
property HtmlDocument.DocumentNode: HtmlNode
HtmlNode.SelectNodes(xpath: string) : HtmlNodeCollection
module Seq

from Microsoft.FSharp.Collections
val map : mapping:('T -> 'U) -> source:seq<'T> -> seq<'U>

Full name: Microsoft.FSharp.Collections.Seq.map
val x : HtmlNode
property HtmlNode.OuterHtml: string
Multiple items
module Choice

from FSharpx

--------------------
type Choice<'T1,'T2> =
  | Choice1Of2 of 'T1
  | Choice2Of2 of 'T2

Full name: Microsoft.FSharp.Core.Choice<_,_>

--------------------
type Choice<'T1,'T2,'T3> =
  | Choice1Of3 of 'T1
  | Choice2Of3 of 'T2
  | Choice3Of3 of 'T3

Full name: Microsoft.FSharp.Core.Choice<_,_,_>

--------------------
type Choice<'T1,'T2,'T3,'T4> =
  | Choice1Of4 of 'T1
  | Choice2Of4 of 'T2
  | Choice3Of4 of 'T3
  | Choice4Of4 of 'T4

Full name: Microsoft.FSharp.Core.Choice<_,_,_,_>

--------------------
type Choice<'T1,'T2,'T3,'T4,'T5> =
  | Choice1Of5 of 'T1
  | Choice2Of5 of 'T2
  | Choice3Of5 of 'T3
  | Choice4Of5 of 'T4
  | Choice5Of5 of 'T5

Full name: Microsoft.FSharp.Core.Choice<_,_,_,_,_>

--------------------
type Choice<'T1,'T2,'T3,'T4,'T5,'T6> =
  | Choice1Of6 of 'T1
  | Choice2Of6 of 'T2
  | Choice3Of6 of 'T3
  | Choice4Of6 of 'T4
  | Choice5Of6 of 'T5
  | Choice6Of6 of 'T6

Full name: Microsoft.FSharp.Core.Choice<_,_,_,_,_,_>

--------------------
type Choice<'T1,'T2,'T3,'T4,'T5,'T6,'T7> =
  | Choice1Of7 of 'T1
  | Choice2Of7 of 'T2
  | Choice3Of7 of 'T3
  | Choice4Of7 of 'T4
  | Choice5Of7 of 'T5
  | Choice6Of7 of 'T6
  | Choice7Of7 of 'T7

Full name: Microsoft.FSharp.Core.Choice<_,_,_,_,_,_,_>
val protect : f:('a -> 'b) -> x:'a -> Choice<'b,exn>

Full name: FSharpx.Choice.protect
val fold : folder:('State -> 'T -> 'State) -> state:'State -> source:seq<'T> -> 'State

Full name: Microsoft.FSharp.Collections.Seq.fold
val files : string list
val es : exn list
val choice : f1:('a -> 'b) -> f2:('c -> 'b) -> _arg1:Choice<'a,'c> -> 'b

Full name: FSharpx.Choice.choice
val f : string
val e : exn
val acc : string
val sprintf : format:Printf.StringFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.sprintf
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
val filter : predicate:('T -> bool) -> source:seq<'T> -> seq<'T>

Full name: Microsoft.FSharp.Collections.Seq.filter
Raw view Test code New version

More information

Link:http://fssnip.net/iS
Posted:10 years ago
Author:@kot_2010
Tags: html , parsing