using System;
using System.IO;
using System.Net;
using System.Security.Cryptography;
using System.Text;
using Microsoft.Office.Server.Search.Administration;

namespace CrawlLogExporter
{
    /// <summary>
    /// The world's smallest connector for pushing SharePoint crawl log data to Elasticsearch.
    /// Expects two arguments: the Search Service Application id (GUID) and a content source name.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            var appId = new Guid(args[0]);
            var contentSourceName = args[1];

            SearchService searchService = SearchService.Service;
            SearchServiceApplication searchApp = searchService.SearchApplications.GetValue(appId);
            Console.Out.WriteLine("Found Search Service Application");

            var viewer = new LogViewer(searchApp);
            viewer.MaxDaysCrawlLogged = 20;

            Console.Out.WriteLine("Getting all status messages");
            var msgs = viewer.GetAllStatusMessages();
            Console.Out.WriteLine("Got 'em all!");

            // Resolve the id of the content source we were asked to export.
            var content = new Content(searchApp);
            var sources = content.ContentSources;
            int sourceId = 0;
            foreach (var source in sources)
            {
                var cs = (ContentSource)source;
                Console.Out.WriteLine(cs.Name + " Warns: " + cs.WarningCount + " Errors: " + cs.ErrorCount + ", id: " + cs.Id);
                if (cs.Name.Equals(contentSourceName))
                {
                    sourceId = cs.Id;
                }
            }

            foreach (var statusRow in msgs.Select())
            {
                int errorId = Convert.ToInt32(statusRow.ItemArray[0].ToString());
                // 0 = OK, 1 = deletes
                //if (errorId == 0 || errorId == 1) continue;
                Console.Out.WriteLine("Working with errorId " + errorId);
                string errorMessage = statusRow.ItemArray[1].ToString();

                var crawlFilters = new CrawlLogFilters();
                crawlFilters.AddFilter(CrawlLogFilterProperty.ContentSourceId, sourceId);
                crawlFilters.AddFilter(CrawlLogFilterProperty.MessageId, errorId);
                //crawlFilters.AddFilter(DateTime.Now.AddDays(-1), DateTime.Now);

                // Page through the crawl log; the viewer reports -1 as the next start index
                // when there are no more rows for the current filter.
                int nextStart = 0;
                int batchSize = 100000;
                while (nextStart != -1)
                {
                    crawlFilters.AddFilter(CrawlLogFilterProperty.StartAt, nextStart);
                    crawlFilters.AddFilter(CrawlLogFilterProperty.TotalEntries, batchSize);
                    Console.Out.WriteLine(string.Format("Getting {0} entries starting from {1}", batchSize, nextStart));
                    var urls = viewer.GetCurrentCrawlLogData(crawlFilters, out nextStart);

                    foreach (var url in urls.Select())
                    {
                        var entry = new CrawlLogEntry
                        {
                            ContentSource = contentSourceName,
                            Url = url.ItemArray[0].ToString(),
                            ErrorId = errorId,
                            ErrorMessage = url.ItemArray[2].ToString(),
                            ErrorDescription = url.ItemArray[4].ToString(),
                            Date = ((DateTime)url.ItemArray[6]).ToString("yyyy-MM-dd HH:mm:ss")
                        };
                        //Console.Out.WriteLine("Working on: " + entry.Url);

                        // JavaScriptSerializer lives in System.Web.Extensions.dll.
                        var jsonSerializer = new System.Web.Script.Serialization.JavaScriptSerializer();
                        string json = jsonSerializer.Serialize(entry);

                        // The document id is md5(Date + Url), so re-running the export
                        // overwrites entries instead of duplicating them.
                        var httpWebRequest = (HttpWebRequest)WebRequest.Create("http://localhost:9200/crawllog/log/" + GetHashId(entry));
                        httpWebRequest.ContentType = "application/json";
                        httpWebRequest.Method = "POST";

                        using (var streamWriter = new StreamWriter(httpWebRequest.GetRequestStream()))
                        {
                            streamWriter.Write(json);
                            streamWriter.Flush();
                        }

                        var httpResponse = (HttpWebResponse)httpWebRequest.GetResponse();
                        using (var streamReader = new StreamReader(httpResponse.GetResponseStream()))
                        {
                            var result = streamReader.ReadToEnd();
                            //Console.Out.WriteLine(httpResponse.StatusDescription);
                        }
                    }
                }
            }
        }

        private static string GetHashId(CrawlLogEntry entry)
        {
            // Hex-encoded MD5 hash of Date + Url, used as the Elasticsearch document id.
            string source = entry.Date + entry.Url;
            byte[] sourceBytes = Encoding.ASCII.GetBytes(source);
            byte[] hashBytes;
            using (var md5 = new MD5CryptoServiceProvider())
            {
                hashBytes = md5.ComputeHash(sourceBytes);
            }

            var sOutput = new StringBuilder(hashBytes.Length * 2);
            foreach (byte b in hashBytes)
            {
                sOutput.Append(b.ToString("X2"));
            }
            return sOutput.ToString();
        }
    }

    /// <summary>
    /// Simple DTO holding a crawl log entry before it is pushed to Elasticsearch.
    /// </summary>
    public class CrawlLogEntry
    {
        public string Date { get; set; }
        public int ErrorId { get; set; }
        public string ErrorMessage { get; set; }
        public string ErrorDescription { get; set; }
        public string Url { get; set; }
        public string ContentSource { get; set; }
    }
}
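// Example usage (a sketch only: the original does not specify the assembly name or an
// Elasticsearch mapping, so those details below are assumptions). Assuming the build output
// is CrawlLogExporter.exe and it is run on a farm server with access to the Search Service
// Application:
//
//   CrawlLogExporter.exe 1a2b3c4d-0000-0000-0000-000000000000 "Local SharePoint Sites"
//
// The first argument is the Search Service Application id (GUID), the second the content
// source name as shown in Central Administration. Each crawl log row is posted to the local
// Elasticsearch instance as a document of roughly this shape (field values illustrative):
//
//   POST http://localhost:9200/crawllog/log/<md5 of Date + Url>
//   {
//     "Date": "2013-01-01 12:00:00",
//     "ErrorId": 2,
//     "ErrorMessage": "...",
//     "ErrorDescription": "...",
//     "Url": "http://intranet/sites/hr/default.aspx",
//     "ContentSource": "Local SharePoint Sites"
//   }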