21 people like it.

Web Crawler

This snippet features an F# Web crawler that i'm already using in 2 applications (slightly modified). It's based on a scalable network of communicating agents that follow URLs extracted from HTML pages until reaching the specified limit.

  1: 
  2: 
  3: 
  4: 
  5: 
  6: 
  7: 
  8: 
  9: 
 10: 
 11: 
 12: 
 13: 
 14: 
 15: 
 16: 
 17: 
 18: 
 19: 
 20: 
 21: 
 22: 
 23: 
 24: 
 25: 
 26: 
 27: 
 28: 
 29: 
 30: 
 31: 
 32: 
 33: 
 34: 
 35: 
 36: 
 37: 
 38: 
 39: 
 40: 
 41: 
 42: 
 43: 
 44: 
 45: 
 46: 
 47: 
 48: 
 49: 
 50: 
 51: 
 52: 
 53: 
 54: 
 55: 
 56: 
 57: 
 58: 
 59: 
 60: 
 61: 
 62: 
 63: 
 64: 
 65: 
 66: 
 67: 
 68: 
 69: 
 70: 
 71: 
 72: 
 73: 
 74: 
 75: 
 76: 
 77: 
 78: 
 79: 
 80: 
 81: 
 82: 
 83: 
 84: 
 85: 
 86: 
 87: 
 88: 
 89: 
 90: 
 91: 
 92: 
 93: 
 94: 
 95: 
 96: 
 97: 
 98: 
 99: 
100: 
101: 
102: 
103: 
104: 
105: 
106: 
107: 
108: 
109: 
110: 
111: 
112: 
113: 
114: 
115: 
116: 
117: 
118: 
119: 
120: 
121: 
122: 
123: 
124: 
125: 
126: 
127: 
128: 
129: 
130: 
131: 
132: 
133: 
134: 
135: 
136: 
137: 
138: 
139: 
140: 
141: 
142: 
143: 
144: 
145: 
146: 
147: 
148: 
149: 
150: 
151: 
152: 
153: 
154: 
155: 
156: 
157: 
158: 
159: 
160: 
161: 
162: 
163: 
164: 
165: 
166: 
167: 
168: 
169: 
170: 
171: 
172: 
173: 
174: 
175: 
176: 
177: 
178: 
179: 
180: 
181: 
182: 
183: 
184: 
185: 
186: 
187: 
188: 
189: 
190: 
191: 
192: 
193: 
194: 
open System
open System.Collections.Concurrent
open System.Collections.Generic
open System.IO
open System.Net
open System.Text.RegularExpressions

module Helpers =

    type Message =
        | Done
        | Mailbox of MailboxProcessor<Message>
        | Stop
        | Url of string option

    // Gates the number of crawling agents.
    [<Literal>]
    let Gate = 5

    // Extracts links from HTML.
    let extractLinks html =
        let pattern1 = "(?i)href\\s*=\\s*(\"|\')/?((?!#.*|/\B|mailto:|location\.|javascript:)[^\"\']+)(\"|\')"
        let pattern2 = "(?i)^https?"
 
        let links =
            [
                for x in Regex(pattern1).Matches(html) do
                    yield x.Groups.[2].Value
            ]
            |> List.filter (fun x -> Regex(pattern2).IsMatch(x))
        links
    
    // Fetches a Web page.
    let fetch (url : string) =
        try
            let req = WebRequest.Create(url) :?> HttpWebRequest
            req.UserAgent <- "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)"
            req.Timeout <- 5000
            use resp = req.GetResponse()
            let content = resp.ContentType
            let isHtml = Regex("html").IsMatch(content)
            match isHtml with
            | true -> use stream = resp.GetResponseStream()
                      use reader = new StreamReader(stream)
                      let html = reader.ReadToEnd()
                      Some html
            | false -> None
        with
        | _ -> None
    
    let collectLinks url =
        let html = fetch url
        match html with
        | Some x -> extractLinks x
        | None -> []

open Helpers

let crawl url limit =
    // Concurrent queue for saving collected urls.
    let q = ConcurrentQueue<string>()
    
    // Holds crawled URLs.
    let set = HashSet<string>()

    let supervisor =
        MailboxProcessor.Start(fun x ->
            let rec loop run =
                async {
                    let! msg = x.Receive()
                    match msg with
                    | Mailbox(mailbox) -> 
                        let count = set.Count
                        if count < limit - 1 && run then 
                            let url = q.TryDequeue()
                            match url with
                            | true, str -> if not (set.Contains str) then
                                                let set'= set.Add str
                                                mailbox.Post <| Url(Some str)
                                                return! loop run
                                            else
                                                mailbox.Post <| Url None
                                                return! loop run

                            | _ -> mailbox.Post <| Url None
                                   return! loop run
                        else
                            mailbox.Post Stop
                            return! loop run
                    | Stop -> return! loop false
                    | _ -> printfn "Supervisor is done."
                           (x :> IDisposable).Dispose()
                }
            loop true)

    
    let urlCollector =
        MailboxProcessor.Start(fun y ->
            let rec loop count =
                async {
                    let! msg = y.TryReceive(6000)
                    match msg with
                    | Some message ->
                        match message with
                        | Url u ->
                            match u with
                            | Some url -> q.Enqueue url
                                          return! loop count
                            | None -> return! loop count
                        | _ ->
                            match count with
                            | Gate -> supervisor.Post Done
                                      (y :> IDisposable).Dispose()
                                      printfn "URL collector is done."
                            | _ -> return! loop (count + 1)
                    | None -> supervisor.Post Stop
                              return! loop count
                }
            loop 1)
    
    /// Initializes a crawling agent.
    let crawler id =
        MailboxProcessor.Start(fun inbox ->
            let rec loop() =
                async {
                    let! msg = inbox.Receive()
                    match msg with
                    | Url x ->
                        match x with
                        | Some url -> 
                                let links = collectLinks url
                                printfn "%s crawled by agent %d." url id
                                for link in links do
                                    urlCollector.Post <| Url (Some link)
                                supervisor.Post(Mailbox(inbox))
                                return! loop()
                        | None -> supervisor.Post(Mailbox(inbox))
                                  return! loop()
                    | _ -> urlCollector.Post Done
                           printfn "Agent %d is done." id
                           (inbox :> IDisposable).Dispose()
                    }
            loop())

    // Spawn the crawlers.
    let crawlers = 
        [
            for i in 1 .. Gate do
                yield crawler i
        ]
    
    // Post the first messages.
    crawlers.Head.Post <| Url (Some url)
    crawlers.Tail |> List.iter (fun ag -> ag.Post <| Url None)

// Example:
crawl "http://news.google.com" 25

// Output:
// http://news.google.com crawled by agent 1.
// http://www.gstatic.com/news/img/favicon.ico crawled by agent 2.
// http://www.google.com/webhp?hl=en&tab=nw crawled by agent 5.
// http://www.google.com/imghp?hl=en&tab=ni crawled by agent 3.
// http://video.google.com/?hl=en&tab=nv crawled by agent 4.
// http://www.google.com/prdhp?hl=en&tab=nf crawled by agent 5.
// http://news.google.com/news?pz=1&amp;cf=all&amp;ned=us&amp;hl=en
// &amp;topic=h&amp;num=3&amp;output=rss crawled by agent 1.
// http://www.google.com/intl/en/options/ crawled by agent 4.
// http://maps.google.com/maps?hl=en&tab=nl crawled by agent 2.
// http://www.google.com/finance?hl=en&tab=ne crawled by agent 1.
// http://scholar.google.com/schhp?hl=en&tab=ns crawled by agent 2.
// http://www.google.com/realtime?hl=en&tab=nY crawled by agent 2.
// http://mail.google.com/mail/?tab=nm crawled by agent 3.
// http://books.google.com/bkshp?hl=en&tab=np crawled by agent 5.
// http://translate.google.com/?hl=en&tab=nT crawled by agent 4.
// http://blogsearch.google.com/?hl=en&tab=nb crawled by agent 1.
// http://www.google.com/calendar?hl=en&tab=nc crawled by agent 3.
// http://picasaweb.google.com/home?hl=en&tab=nq crawled by agent 5.
// http://www.google.com/reader/?tab=ny crawled by agent 1.
// http://docs.google.com/?tab=no crawled by agent 4.
// https://www.google.com/accounts/ServiceLogin?service=news&amp;pas
// sive=1209600&amp;continue=http://news.google.com/&amp;followup=htt
// p://news.google.com/ crawled by agent 1.
// Agent 1 is done.
// http://www.google.com/preferences?hl=en&prev=http://news.google.com/ crawled by agent 4.
// Agent 4 is done.
// http://sites.google.com/?tab=n3 crawled by agent 3.
// Agent 3 is done.
// http://www.youtube.com/?hl=en&tab=n1 crawled by agent 2.
// Agent 2 is done.
// http://groups.google.com/grphp?hl=en&tab=ng crawled by agent 5.
// Agent 5 is done.
// URL collector is done.
// Supervisor is done.
namespace System
namespace System.Collections
namespace System.Collections.Concurrent
namespace System.Collections.Generic
namespace System.IO
namespace System.Net
namespace System.Text
namespace System.Text.RegularExpressions
type Message =
  | Done
  | Mailbox of MailboxProcessor<Message>
  | Stop
  | Url of string option

Full name: Script.Helpers.Message
union case Message.Done: Message
union case Message.Mailbox: MailboxProcessor<Message> -> Message
Multiple items
type MailboxProcessor<'Msg> =
  interface IDisposable
  new : body:(MailboxProcessor<'Msg> -> Async<unit>) * ?cancellationToken:CancellationToken -> MailboxProcessor<'Msg>
  member Post : message:'Msg -> unit
  member PostAndAsyncReply : buildMessage:(AsyncReplyChannel<'Reply> -> 'Msg) * ?timeout:int -> Async<'Reply>
  member PostAndReply : buildMessage:(AsyncReplyChannel<'Reply> -> 'Msg) * ?timeout:int -> 'Reply
  member PostAndTryAsyncReply : buildMessage:(AsyncReplyChannel<'Reply> -> 'Msg) * ?timeout:int -> Async<'Reply option>
  member Receive : ?timeout:int -> Async<'Msg>
  member Scan : scanner:('Msg -> Async<'T> option) * ?timeout:int -> Async<'T>
  member Start : unit -> unit
  member TryPostAndReply : buildMessage:(AsyncReplyChannel<'Reply> -> 'Msg) * ?timeout:int -> 'Reply option
  ...

Full name: Microsoft.FSharp.Control.MailboxProcessor<_>

--------------------
new : body:(MailboxProcessor<'Msg> -> Async<unit>) * ?cancellationToken:Threading.CancellationToken -> MailboxProcessor<'Msg>
union case Message.Stop: Message
union case Message.Url: string option -> Message
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
type 'T option = Option<'T>

Full name: Microsoft.FSharp.Core.option<_>
Multiple items
type LiteralAttribute =
  inherit Attribute
  new : unit -> LiteralAttribute

Full name: Microsoft.FSharp.Core.LiteralAttribute

--------------------
new : unit -> LiteralAttribute
val Gate : int

Full name: Script.Helpers.Gate
val extractLinks : html:string -> string list

Full name: Script.Helpers.extractLinks
val html : string
val pattern1 : string
val pattern2 : string
val links : string list
val x : Match
Multiple items
type Regex =
  new : pattern:string -> Regex + 1 overload
  member GetGroupNames : unit -> string[]
  member GetGroupNumbers : unit -> int[]
  member GroupNameFromNumber : i:int -> string
  member GroupNumberFromName : name:string -> int
  member IsMatch : input:string -> bool + 1 overload
  member Match : input:string -> Match + 2 overloads
  member Matches : input:string -> MatchCollection + 1 overload
  member Options : RegexOptions
  member Replace : input:string * replacement:string -> string + 5 overloads
  ...

Full name: System.Text.RegularExpressions.Regex

--------------------
Regex(pattern: string) : unit
Regex(pattern: string, options: RegexOptions) : unit
property Match.Groups: GroupCollection
Multiple items
type List<'T> =
  new : unit -> List<'T> + 2 overloads
  member Add : item:'T -> unit
  member AddRange : collection:IEnumerable<'T> -> unit
  member AsReadOnly : unit -> ReadOnlyCollection<'T>
  member BinarySearch : item:'T -> int + 2 overloads
  member Capacity : int with get, set
  member Clear : unit -> unit
  member Contains : item:'T -> bool
  member ConvertAll<'TOutput> : converter:Converter<'T, 'TOutput> -> List<'TOutput>
  member CopyTo : array:'T[] -> unit + 2 overloads
  ...
  nested type Enumerator

Full name: System.Collections.Generic.List<_>

--------------------
List() : unit
List(capacity: int) : unit
List(collection: IEnumerable<'T>) : unit
val filter : predicate:('T -> bool) -> list:'T list -> 'T list

Full name: Microsoft.FSharp.Collections.List.filter
val x : string
val fetch : url:string -> string option

Full name: Script.Helpers.fetch
val url : string
val req : HttpWebRequest
type WebRequest =
  inherit MarshalByRefObject
  member Abort : unit -> unit
  member AuthenticationLevel : AuthenticationLevel with get, set
  member BeginGetRequestStream : callback:AsyncCallback * state:obj -> IAsyncResult
  member BeginGetResponse : callback:AsyncCallback * state:obj -> IAsyncResult
  member CachePolicy : RequestCachePolicy with get, set
  member ConnectionGroupName : string with get, set
  member ContentLength : int64 with get, set
  member ContentType : string with get, set
  member Credentials : ICredentials with get, set
  member EndGetRequestStream : asyncResult:IAsyncResult -> Stream
  ...

Full name: System.Net.WebRequest
WebRequest.Create(requestUri: Uri) : WebRequest
WebRequest.Create(requestUriString: string) : WebRequest
type HttpWebRequest =
  inherit WebRequest
  member Abort : unit -> unit
  member Accept : string with get, set
  member AddRange : range:int -> unit + 7 overloads
  member Address : Uri
  member AllowAutoRedirect : bool with get, set
  member AllowWriteStreamBuffering : bool with get, set
  member AutomaticDecompression : DecompressionMethods with get, set
  member BeginGetRequestStream : callback:AsyncCallback * state:obj -> IAsyncResult
  member BeginGetResponse : callback:AsyncCallback * state:obj -> IAsyncResult
  member ClientCertificates : X509CertificateCollection with get, set
  ...

Full name: System.Net.HttpWebRequest
property HttpWebRequest.UserAgent: string
property HttpWebRequest.Timeout: int
val resp : WebResponse
HttpWebRequest.GetResponse() : WebResponse
val content : string
property WebResponse.ContentType: string
val isHtml : bool
val stream : Stream
WebResponse.GetResponseStream() : Stream
val reader : StreamReader
Multiple items
type StreamReader =
  inherit TextReader
  new : stream:Stream -> StreamReader + 9 overloads
  member BaseStream : Stream
  member Close : unit -> unit
  member CurrentEncoding : Encoding
  member DiscardBufferedData : unit -> unit
  member EndOfStream : bool
  member Peek : unit -> int
  member Read : unit -> int + 1 overload
  member ReadLine : unit -> string
  member ReadToEnd : unit -> string
  ...

Full name: System.IO.StreamReader

--------------------
StreamReader(stream: Stream) : unit
StreamReader(path: string) : unit
StreamReader(stream: Stream, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(stream: Stream, encoding: Text.Encoding) : unit
StreamReader(path: string, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(path: string, encoding: Text.Encoding) : unit
StreamReader(stream: Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(path: string, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(stream: Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool, bufferSize: int) : unit
StreamReader(path: string, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool, bufferSize: int) : unit
StreamReader.ReadToEnd() : string
union case Option.Some: Value: 'T -> Option<'T>
union case Option.None: Option<'T>
val collectLinks : url:string -> string list

Full name: Script.Helpers.collectLinks
val html : string option
module Helpers

from Script
val crawl : url:string -> limit:int -> unit

Full name: Script.crawl
val limit : int
val q : ConcurrentQueue<string>
Multiple items
type ConcurrentQueue<'T> =
  new : unit -> ConcurrentQueue<'T> + 1 overload
  member CopyTo : array:'T[] * index:int -> unit
  member Count : int
  member Enqueue : item:'T -> unit
  member GetEnumerator : unit -> IEnumerator<'T>
  member IsEmpty : bool
  member ToArray : unit -> 'T[]
  member TryDequeue : result:'T -> bool
  member TryPeek : result:'T -> bool

Full name: System.Collections.Concurrent.ConcurrentQueue<_>

--------------------
ConcurrentQueue() : unit
ConcurrentQueue(collection: IEnumerable<'T>) : unit
val set : HashSet<string>
Multiple items
type HashSet<'T> =
  new : unit -> HashSet<'T> + 3 overloads
  member Add : item:'T -> bool
  member Clear : unit -> unit
  member Comparer : IEqualityComparer<'T>
  member Contains : item:'T -> bool
  member CopyTo : array:'T[] -> unit + 2 overloads
  member Count : int
  member ExceptWith : other:IEnumerable<'T> -> unit
  member GetEnumerator : unit -> Enumerator<'T>
  member GetObjectData : info:SerializationInfo * context:StreamingContext -> unit
  ...
  nested type Enumerator

Full name: System.Collections.Generic.HashSet<_>

--------------------
HashSet() : unit
HashSet(comparer: IEqualityComparer<'T>) : unit
HashSet(collection: IEnumerable<'T>) : unit
HashSet(collection: IEnumerable<'T>, comparer: IEqualityComparer<'T>) : unit
val supervisor : MailboxProcessor<Message>
static member MailboxProcessor.Start : body:(MailboxProcessor<'Msg> -> Async<unit>) * ?cancellationToken:Threading.CancellationToken -> MailboxProcessor<'Msg>
val x : MailboxProcessor<Message>
val loop : (bool -> Async<unit>)
val run : bool
val async : AsyncBuilder

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.async
val msg : Message
member MailboxProcessor.Receive : ?timeout:int -> Async<'Msg>
val mailbox : MailboxProcessor<Message>
val count : int
property HashSet.Count: int
val url : bool * string
ConcurrentQueue.TryDequeue(result: byref<string>) : bool
val str : string
val not : value:bool -> bool

Full name: Microsoft.FSharp.Core.Operators.not
HashSet.Contains(item: string) : bool
val set' : bool
HashSet.Add(item: string) : bool
member MailboxProcessor.Post : message:'Msg -> unit
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
type IDisposable =
  member Dispose : unit -> unit

Full name: System.IDisposable
val urlCollector : MailboxProcessor<Message>
val y : MailboxProcessor<Message>
val loop : (int -> Async<unit>)
val msg : Message option
member MailboxProcessor.TryReceive : ?timeout:int -> Async<'Msg option>
val message : Message
val u : string option
ConcurrentQueue.Enqueue(item: string) : unit
val crawler : (int -> MailboxProcessor<Message>)


 Initializes a crawling agent.
val id : int
val inbox : MailboxProcessor<Message>
val loop : (unit -> Async<unit>)
val x : string option
val link : string
val crawlers : MailboxProcessor<Message> list
val i : int
property List.Head: MailboxProcessor<Message>
property List.Tail: MailboxProcessor<Message> list
val iter : action:('T -> unit) -> list:'T list -> unit

Full name: Microsoft.FSharp.Collections.List.iter
val ag : MailboxProcessor<Message>
Raw view New version

More information

Link:http://fssnip.net/3K
Posted:4 years ago
Author:Taha Hachana
Tags: Web , Crawler , Agent , MailboxProcessor , Regex , HTML