module StackOverflowCrawler open System // Mennään .NET-perus-stäkillä. open System.Net // async-webrequest-versio helppo tehdä tällä: http://fsharppowerpack.codeplex.com/ open System.IO // string-parsinta kannattaisi tehdä tällä: http://htmlagilitypack.codeplex.com/ open System.Web let fetch (url : Uri) = let req = WebRequest.Create (url) :?> HttpWebRequest use stream = req.GetResponse().GetResponseStream() use reader = new StreamReader(stream) reader.ReadToEnd() let makeUrl pagetype (tags:string) = new Uri("http://stackoverflow.com/" + pagetype + "/tagged/" + HttpUtility.UrlEncode(tags)) let questions, unanswered = makeUrl "questions", makeUrl "unanswered" let sumcount (fetched:string) = let startpos = (fetched.IndexOf "
")+29 let endpos = fetched.IndexOf("
",startpos) fetched.Substring(startpos,endpos-startpos).Replace(",","") |> Double.Parse let relatedtags (basetag:string) (fetched:string) = //lisää parsintaa... let rec relativepositions (links:string) (found:string list) = let startpos = links.IndexOf("/questions/tagged/" + basetag + "+") let realpos = startpos + 19 + basetag.Length let endpos = links.IndexOf("\"", realpos) let tag = links.Substring(realpos,endpos-realpos) match startpos with -1 -> found | _ -> tag :: relativepositions (links.Substring realpos) found relativepositions fetched [] type surfmode = Inclusive | Exclusive let checktag (sm:surfmode) basetag = let acceptRate, minCount = 0.02, 1000.0; let add = match sm with Inclusive -> "+" | Exclusive -> "+-" let rec surf (tags:string) (tagsToSurf:string list) = let fetchTotalPage = tags |> (questions >> fetch) let taggedQuestions = fetchTotalPage |> sumcount if taggedQuestions >= minCount then let unasweredWithTag = unanswered >> fetch >> sumcount let ratio = (unasweredWithTag tags) / taggedQuestions let surfTheRestOfTree rest = let test tag = surf (tags + add + tag) [] List.iter test rest do printfn "Ratio %f and count %g with tags %s" ratio taggedQuestions tags match ratio with | r when r <= acceptRate -> do printfn "--- Accepted: %s ---" basetag | _ -> match relatedtags tags fetchTotalPage with | first::rest -> do printfn "Failed. Trying %d related..." rest.Length surf (tags + add + first) rest surfTheRestOfTree rest | _ -> surfTheRestOfTree tagsToSurf surf (basetag.ToLower()) [] do printfn "Everything checked." //Interactive tests: //questions "java" //questions "F#" //unanswered "java" //let fetched = questions "java" |> fetch //fetched |> sumcount //fetched |> relatedtags "java" //checktag surfmode.Exclusive "F#" //checktag surfmode.Inclusive "flash+flex" //checktag surfmode.Exclusive "flash+flex" //checktag surfmode.Exclusive "java" //jauhaa ikuisuuden eikä löydä mitään?