using System;
using System.IO;
using System.Net;
using System.Security.Cryptography;
using System.Text;
using Microsoft.Office.Server.Search.Administration;
namespace CrawlLogExporter
{
    /// <summary>
    /// The world's smallest connector for pushing SharePoint crawl log data to Elasticsearch.
    /// </summary>
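    /// <example>
    /// Example invocation (the GUID and content source name are placeholders for your
    /// own Search Service Application id and content source):
    /// <code>CrawlLogExporter.exe 11111111-2222-3333-4444-555555555555 "Local SharePoint sites"</code>
    /// </example>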
class Program
{
static void Main(string[] args)
{
            // Expect two arguments: the Search Service Application GUID and a content source name.
            if (args.Length < 2)
            {
                Console.Error.WriteLine("Usage: CrawlLogExporter.exe <searchServiceApplicationId> <contentSourceName>");
                return;
            }
            var appId = new Guid(args[0]);
            var contentSourceName = args[1];
            SearchService searchService = SearchService.Service;
            SearchServiceApplication searchApp =
                searchService.SearchApplications.GetValue(appId);
            Console.WriteLine("Found Search Service Application");
            var viewer = new LogViewer(searchApp);
            viewer.MaxDaysCrawlLogged = 20;
            Console.WriteLine("Getting all status messages");
            var msgs = viewer.GetAllStatusMessages();
            Console.WriteLine("Got 'em all!");
            Content content = new Content(searchApp);
            var sources = content.ContentSources;
            // Resolve the content source name given on the command line to its numeric id.
            int sourceId = 0;
            foreach (var source in sources)
            {
                var cs = (ContentSource)source;
                Console.WriteLine(cs.Name + " Warnings: " + cs.WarningCount + ", Errors: " + cs.ErrorCount + ", Id: " + cs.Id);
                if (cs.Name.Equals(contentSourceName))
                {
                    sourceId = cs.Id;
                }
            }
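            // Each status message row pairs a crawl log message id with its text;
            // the loop below exports one filtered batch of URLs per message id.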
            foreach (var errmsg in msgs.Select())
            {
                int errorId = Convert.ToInt32(errmsg.ItemArray[0].ToString());
                // 0 = OK, 1 = deletes
                //if (errorId == 0 || errorId == 1) continue;
                string errorMessage = errmsg.ItemArray[1].ToString();
                Console.WriteLine("Working with errorId " + errorId + " (" + errorMessage + ")");
                var crawlFilters = new CrawlLogFilters();
                crawlFilters.AddFilter(CrawlLogFilterProperty.ContentSourceId, sourceId);
                crawlFilters.AddFilter(CrawlLogFilterProperty.MessageId, errorId);
                //crawlFilters.AddFilter(DateTime.Now.AddDays(-1), DateTime.Now);
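                // Page through the crawl log in batches: GetCurrentCrawlLogData returns
                // the offset of the next batch via nextStart, or -1 when no entries remain.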
                int nextStart = 0;
                int batchSize = 100000;
                while (nextStart != -1)
                {
                    crawlFilters.AddFilter(CrawlLogFilterProperty.StartAt, nextStart);
                    crawlFilters.AddFilter(CrawlLogFilterProperty.TotalEntries, batchSize);
                    Console.WriteLine(string.Format("Getting {0} entries starting from {1}", batchSize, nextStart));
                    var urls = viewer.GetCurrentCrawlLogData(crawlFilters, out nextStart);
foreach (var url in urls.Select())
{
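                        // Crawl log columns used here: 0 = URL, 2 = error message,
                        // 4 = error description, 6 = timestamp.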
var entry = new CrawlLogEntry
{
                            ContentSource = contentSourceName,
Url = url.ItemArray[0].ToString(),
ErrorId = errorId,
ErrorMessage = url.ItemArray[2].ToString(),
ErrorDescription = url.ItemArray[4].ToString(),
Date = ((DateTime) url.ItemArray[6]).ToString("yyyy-MM-dd HH:mm:ss")
};
//System.Console.Out.WriteLine("Working on: " + entry.Url);
var jsonSerializer = new System.Web.Script.Serialization.JavaScriptSerializer();
string json = jsonSerializer.Serialize(entry);
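                        // Indexing with a deterministic id (md5 of date + url) makes the export
                        // idempotent: re-running it overwrites existing documents instead of
                        // creating duplicates.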
                        var httpWebRequest =
                            (HttpWebRequest)WebRequest.Create("http://localhost:9200/crawllog/log/" + GetHashId(entry)); // hash = md5(date + url)
                        httpWebRequest.ContentType = "application/json";
                        httpWebRequest.Method = "PUT"; // PUT rather than POST, since we supply the document id explicitly
                        using (var streamWriter = new StreamWriter(httpWebRequest.GetRequestStream()))
                        {
                            streamWriter.Write(json);
                        }
                        using (var httpResponse = (HttpWebResponse)httpWebRequest.GetResponse())
                        using (var streamReader = new StreamReader(httpResponse.GetResponseStream()))
                        {
                            var result = streamReader.ReadToEnd();
                            //Console.WriteLine(httpResponse.StatusDescription);
                        }
}
}
}
}
        /// <summary>
        /// Builds a stable Elasticsearch document id by MD5-hashing the entry's date and URL.
        /// </summary>
        private static string GetHashId(CrawlLogEntry entry)
        {
            string source = entry.Date + entry.Url;
            byte[] sourceBytes = Encoding.ASCII.GetBytes(source);
            byte[] hash;
            using (var md5 = MD5.Create())
            {
                hash = md5.ComputeHash(sourceBytes);
            }
            // Two hex characters per hash byte.
            var output = new StringBuilder(hash.Length * 2);
            foreach (byte b in hash)
            {
                output.Append(b.ToString("X2"));
            }
            return output.ToString();
        }
}
    /// <summary>
    /// Simple DTO holding one crawl log entry before it is pushed to Elasticsearch.
    /// </summary>
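    /// <remarks>
    /// Serialized with JavaScriptSerializer, an entry comes out roughly like
    /// (field values here are illustrative):
    /// {"Date":"2014-05-01 12:00:00","ErrorId":404,"ErrorMessage":"Not found",
    ///  "ErrorDescription":"...","Url":"http://intranet/page.aspx","ContentSource":"Local SharePoint sites"}
    /// </remarks>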
public class CrawlLogEntry
{
public string Date { get; set; }
public int ErrorId { get; set; }
public string ErrorMessage { get; set; }
public string ErrorDescription { get; set; }
public string Url { get; set; }
public string ContentSource { get; set; }
}
}