2 people like it.

StackOverflowCrawler

Tries to find the best technologies on Stack Overflow. Don't use tags that are too broad (they trigger many thousands of requests), or the firewall will block you!

 1: 
 2: 
 3: 
 4: 
 5: 
 6: 
 7: 
 8: 
 9: 
10: 
11: 
12: 
13: 
14: 
15: 
16: 
17: 
18: 
19: 
20: 
21: 
22: 
23: 
24: 
25: 
26: 
27: 
28: 
29: 
30: 
31: 
32: 
33: 
34: 
35: 
36: 
37: 
38: 
39: 
40: 
41: 
42: 
43: 
44: 
45: 
46: 
47: 
48: 
49: 
50: 
51: 
52: 
53: 
54: 
55: 
56: 
57: 
58: 
59: 
60: 
61: 
62: 
63: 
64: 
65: 
66: 
67: 
68: 
69: 
module StackOverflowCrawler
open System      // Let's go with the basic .NET stack.
open System.Net  // an async web-request version would be easy to build with this: http://fsharppowerpack.codeplex.com/
open System.IO   // string parsing would be better done with this: http://htmlagilitypack.codeplex.com/ 
open System.Web

/// Downloads the resource at `url` and returns the whole response body as a string.
/// The request is synchronous. The created request is downcast to HttpWebRequest,
/// so a non-HTTP(S) URI fails with an InvalidCastException.
let fetch (url : Uri) = 
    let req = WebRequest.Create (url) :?> HttpWebRequest
    // The response owns the underlying connection; dispose it as well as the
    // stream and reader so the connection is released deterministically.
    use response = req.GetResponse()
    use stream = response.GetResponseStream()
    use reader = new StreamReader(stream)
    reader.ReadToEnd()

/// Builds the stackoverflow.com listing address for the given page type
/// (the path segment placed before "/tagged/") and a URL-encoded tag expression.
let makeUrl pagetype (tags:string) = 
    let encodedTags = HttpUtility.UrlEncode(tags)
    let address = "http://stackoverflow.com/" + pagetype + "/tagged/" + encodedTags
    Uri(address)
/// URL builder for the "questions" listing of a tag expression.
let questions = makeUrl "questions"
/// URL builder for the "unanswered" listing of a tag expression.
let unanswered = makeUrl "unanswered"

/// Extracts the number found inside the first
/// `<div class="summarycount al">…</div>` element of the fetched page text.
/// Comma separators in the number are removed before parsing.
/// Raises an exception when the element (or its closing tag) is missing,
/// instead of silently parsing some unrelated part of the page.
let sumcount (fetched:string) = 
    let marker = "<div class=\"summarycount al\">"
    let markerpos = fetched.IndexOf(marker, StringComparison.Ordinal)
    if markerpos < 0 then
        failwith "sumcount: summary count element not found in the fetched page"
    // Derive the offset from the marker itself rather than a hard-coded length.
    let startpos = markerpos + marker.Length
    let endpos = fetched.IndexOf("</div>", startpos, StringComparison.Ordinal)
    if endpos < 0 then
        failwith "sumcount: summary count element is not closed in the fetched page"
    let digits = fetched.Substring(startpos, endpos - startpos).Replace(",", "").Trim()
    // Parse culture-independently so the result does not depend on the machine's locale.
    Double.Parse(digits, Globalization.CultureInfo.InvariantCulture)

/// Collects the tags that appear combined with `basetag` in the fetched page text.
/// Every occurrence of `/questions/tagged/<basetag>+` is located, and the text
/// between it and the next double quote is taken as one related tag.
/// Tags are returned in document order; an empty list is returned when there are none.
let relatedtags (basetag:string) (fetched:string) = // more parsing...
    let prefix = "/questions/tagged/" + basetag + "+"
    // Tail-recursive scan over the page using start indices, so no intermediate
    // substrings are copied and "not found" is checked before any slicing.
    let rec collect (from:int) (found:string list) = 
        match fetched.IndexOf(prefix, from, StringComparison.Ordinal) with
        | -1 -> List.rev found
        | startpos ->
            let realpos = startpos + prefix.Length
            match fetched.IndexOf("\"", realpos, StringComparison.Ordinal) with
            | -1 -> List.rev found // unterminated link: stop instead of throwing
            | endpos -> collect realpos (fetched.Substring(realpos, endpos - realpos) :: found)
    collect 0 []

/// Selects how a related tag is appended to the current tag expression
/// while surfing: Inclusive joins with "+", Exclusive joins with "+-".
type surfmode =
    | Inclusive
    | Exclusive

/// Surfs tag combinations starting from `basetag` (lower-cased) and prints,
/// for each combination with at least 1000 tagged questions, the ratio of
/// unanswered to tagged questions. A combination whose ratio is at most 0.02
/// is reported as accepted; otherwise the related tags found on the page are
/// appended (joined according to `sm`) and surfed recursively.
/// All results are written to standard output; the function returns unit.
let checktag (sm:surfmode) basetag = 
    let acceptRate = 0.02
    let minCount = 1000.0
    // Separator placed between the current tag expression and an appended tag.
    let add =
        match sm with
        | Inclusive -> "+"
        | Exclusive -> "+-"
    let rec surf (tags:string) (tagsToSurf:string list) =
        let totalPage = fetch (questions tags)
        let taggedQuestions = sumcount totalPage
        if taggedQuestions >= minCount then
            let unansweredCount = tags |> unanswered |> fetch |> sumcount
            let ratio = unansweredCount / taggedQuestions
            // Surfs each candidate appended to the current expression, with no pending siblings.
            let surfEach (candidates:string list) =
                for tag in candidates do
                    surf (tags + add + tag) []
            printfn "Ratio %f and count %g with tags %s" ratio taggedQuestions tags
            if ratio <= acceptRate then
                printfn "--- Accepted: %s ---" basetag
            else
                match relatedtags tags totalPage with
                | [] -> surfEach tagsToSurf
                | first :: rest ->
                    printfn "Failed. Trying %d related..." rest.Length
                    // The first related tag is surfed with its siblings handed down,
                    // then the siblings are surfed from this level as well.
                    surf (tags + add + first) rest
                    surfEach rest

    surf (basetag.ToLower()) []
    printfn "Everything checked."

//Interactive tests:
//questions "java"
//questions "F#"
//unanswered "java"
//let fetched = questions "java" |> fetch
//fetched |> sumcount
//fetched |> relatedtags "java"
//checktag surfmode.Exclusive "F#"
//checktag surfmode.Inclusive "flash+flex"
//checktag surfmode.Exclusive "flash+flex"
//checktag surfmode.Exclusive "java" // grinds on forever and finds nothing?
module StackOverflowCrawler
namespace System
namespace System.Net
namespace System.IO
namespace System.Web
val fetch : url:Uri -> string

Full name: StackOverflowCrawler.fetch
val url : Uri
Multiple items
type Uri =
  new : uriString:string -> Uri + 5 overloads
  member AbsolutePath : string
  member AbsoluteUri : string
  member Authority : string
  member DnsSafeHost : string
  member Equals : comparand:obj -> bool
  member Fragment : string
  member GetComponents : components:UriComponents * format:UriFormat -> string
  member GetHashCode : unit -> int
  member GetLeftPart : part:UriPartial -> string
  ...

Full name: System.Uri

--------------------
Uri(uriString: string) : unit
Uri(uriString: string, uriKind: UriKind) : unit
Uri(baseUri: Uri, relativeUri: string) : unit
Uri(baseUri: Uri, relativeUri: Uri) : unit
val req : HttpWebRequest
type WebRequest =
  inherit MarshalByRefObject
  member Abort : unit -> unit
  member AuthenticationLevel : AuthenticationLevel with get, set
  member BeginGetRequestStream : callback:AsyncCallback * state:obj -> IAsyncResult
  member BeginGetResponse : callback:AsyncCallback * state:obj -> IAsyncResult
  member CachePolicy : RequestCachePolicy with get, set
  member ConnectionGroupName : string with get, set
  member ContentLength : int64 with get, set
  member ContentType : string with get, set
  member Credentials : ICredentials with get, set
  member EndGetRequestStream : asyncResult:IAsyncResult -> Stream
  ...

Full name: System.Net.WebRequest
WebRequest.Create(requestUri: Uri) : WebRequest
WebRequest.Create(requestUriString: string) : WebRequest
type HttpWebRequest =
  inherit WebRequest
  member Abort : unit -> unit
  member Accept : string with get, set
  member AddRange : range:int -> unit + 7 overloads
  member Address : Uri
  member AllowAutoRedirect : bool with get, set
  member AllowWriteStreamBuffering : bool with get, set
  member AutomaticDecompression : DecompressionMethods with get, set
  member BeginGetRequestStream : callback:AsyncCallback * state:obj -> IAsyncResult
  member BeginGetResponse : callback:AsyncCallback * state:obj -> IAsyncResult
  member ClientCertificates : X509CertificateCollection with get, set
  ...

Full name: System.Net.HttpWebRequest
val stream : Stream
HttpWebRequest.GetResponse() : WebResponse
val reader : StreamReader
Multiple items
type StreamReader =
  inherit TextReader
  new : stream:Stream -> StreamReader + 9 overloads
  member BaseStream : Stream
  member Close : unit -> unit
  member CurrentEncoding : Encoding
  member DiscardBufferedData : unit -> unit
  member EndOfStream : bool
  member Peek : unit -> int
  member Read : unit -> int + 1 overload
  member ReadLine : unit -> string
  member ReadToEnd : unit -> string
  ...

Full name: System.IO.StreamReader

--------------------
StreamReader(stream: Stream) : unit
StreamReader(path: string) : unit
StreamReader(stream: Stream, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(stream: Stream, encoding: Text.Encoding) : unit
StreamReader(path: string, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(path: string, encoding: Text.Encoding) : unit
StreamReader(stream: Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(path: string, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool) : unit
StreamReader(stream: Stream, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool, bufferSize: int) : unit
StreamReader(path: string, encoding: Text.Encoding, detectEncodingFromByteOrderMarks: bool, bufferSize: int) : unit
StreamReader.ReadToEnd() : string
val makeUrl : pagetype:string -> tags:string -> Uri

Full name: StackOverflowCrawler.makeUrl
val pagetype : string
val tags : string
Multiple items
val string : value:'T -> string

Full name: Microsoft.FSharp.Core.Operators.string

--------------------
type string = String

Full name: Microsoft.FSharp.Core.string
Multiple items
type HttpUtility =
  new : unit -> HttpUtility
  static member HtmlAttributeEncode : s:string -> string + 1 overload
  static member HtmlDecode : s:string -> string + 1 overload
  static member HtmlEncode : s:string -> string + 2 overloads
  static member JavaScriptStringEncode : value:string -> string + 1 overload
  static member ParseQueryString : query:string -> NameValueCollection + 1 overload
  static member UrlDecode : str:string -> string + 3 overloads
  static member UrlDecodeToBytes : str:string -> byte[] + 3 overloads
  static member UrlEncode : str:string -> string + 3 overloads
  static member UrlEncodeToBytes : str:string -> byte[] + 3 overloads
  ...

Full name: System.Web.HttpUtility

--------------------
HttpUtility() : unit
HttpUtility.UrlEncode(bytes: byte []) : string
HttpUtility.UrlEncode(str: string) : string
HttpUtility.UrlEncode(str: string, e: Text.Encoding) : string
HttpUtility.UrlEncode(bytes: byte [], offset: int, count: int) : string
val questions : (string -> Uri)

Full name: StackOverflowCrawler.questions
val unanswered : (string -> Uri)

Full name: StackOverflowCrawler.unanswered
val sumcount : fetched:string -> float

Full name: StackOverflowCrawler.sumcount
val fetched : string
val startpos : int
String.IndexOf(value: string) : int
String.IndexOf(value: char) : int
String.IndexOf(value: string, comparisonType: StringComparison) : int
String.IndexOf(value: string, startIndex: int) : int
String.IndexOf(value: char, startIndex: int) : int
String.IndexOf(value: string, startIndex: int, comparisonType: StringComparison) : int
String.IndexOf(value: string, startIndex: int, count: int) : int
String.IndexOf(value: char, startIndex: int, count: int) : int
String.IndexOf(value: string, startIndex: int, count: int, comparisonType: StringComparison) : int
val endpos : int
String.Substring(startIndex: int) : string
String.Substring(startIndex: int, length: int) : string
type Double =
  struct
    member CompareTo : value:obj -> int + 1 overload
    member Equals : obj:obj -> bool + 1 overload
    member GetHashCode : unit -> int
    member GetTypeCode : unit -> TypeCode
    member ToString : unit -> string + 3 overloads
    static val MinValue : float
    static val MaxValue : float
    static val Epsilon : float
    static val NegativeInfinity : float
    static val PositiveInfinity : float
    ...
  end

Full name: System.Double
Double.Parse(s: string) : float
Double.Parse(s: string, provider: IFormatProvider) : float
Double.Parse(s: string, style: Globalization.NumberStyles) : float
Double.Parse(s: string, style: Globalization.NumberStyles, provider: IFormatProvider) : float
val relatedtags : basetag:string -> fetched:string -> string list

Full name: StackOverflowCrawler.relatedtags
val basetag : string
val relativepositions : (string -> string list -> string list)
val links : string
val found : string list
type 'T list = List<'T>

Full name: Microsoft.FSharp.Collections.list<_>
val realpos : int
property String.Length: int
val tag : string
type surfmode =
  | Inclusive
  | Exclusive

Full name: StackOverflowCrawler.surfmode
union case surfmode.Inclusive: surfmode
union case surfmode.Exclusive: surfmode
val checktag : sm:surfmode -> basetag:string -> unit

Full name: StackOverflowCrawler.checktag
val sm : surfmode
val acceptRate : float
val minCount : float
val add : string
val surf : (string -> string list -> unit)
val tagsToSurf : string list
val fetchTotalPage : string
val taggedQuestions : float
val unasweredWithTag : (string -> float)
val ratio : float
val surfTheRestOfTree : (string list -> unit)
val rest : string list
val test : (string -> unit)
Multiple items
module List

from Microsoft.FSharp.Collections

--------------------
type List<'T> =
  | ( [] )
  | ( :: ) of Head: 'T * Tail: 'T list
  interface IEnumerable
  interface IEnumerable<'T>
  member Head : 'T
  member IsEmpty : bool
  member Item : index:int -> 'T with get
  member Length : int
  member Tail : 'T list
  static member Cons : head:'T * tail:'T list -> 'T list
  static member Empty : 'T list

Full name: Microsoft.FSharp.Collections.List<_>
val iter : action:('T -> unit) -> list:'T list -> unit

Full name: Microsoft.FSharp.Collections.List.iter
val printfn : format:Printf.TextWriterFormat<'T> -> 'T

Full name: Microsoft.FSharp.Core.ExtraTopLevelOperators.printfn
val r : float
val first : string
property List.Length: int
String.ToLower() : string
String.ToLower(culture: Globalization.CultureInfo) : string

More information

Link:http://fssnip.net/a9
Posted:12 years ago
Author:
Tags: webcrawling