using System;
using System.IO;
using System.Net;
using System.Text;
namespace DomenicDenicola
{
/// <summary>
/// Provides methods for performing web requests.
/// </summary>
/// <remarks>
/// Each instance keeps its own <see cref="Cookies"/> container, so consecutive requests made through the
/// same instance share cookies the way a browsing session would.
/// </remarks>
public class WebCrawler
{
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Performance", "CA1810:InitializeReferenceTypeStaticFieldsInline", Justification = "This static constructor performs an operation that must take place before any use of WebCrawler, and is not simply field initialization.")]
    static WebCrawler()
    {
        // SECURITY NOTE(review): this makes the whole process unconditionally accept all server
        // certificates (the callback is global, not per-crawler), which disables TLS validation for
        // every request issued through ServicePointManager. Kept as-is because existing callers may
        // rely on crawling hosts with self-signed certificates; confirm this is still wanted.
        ServicePointManager.ServerCertificateValidationCallback += (sender, cert, chain, policyErrors) => true;
    }

    /// <summary>
    /// Initializes a new <see cref="WebCrawler"/> with an empty cookie container and the
    /// <see cref="DefaultUserAgent"/>.
    /// </summary>
    public WebCrawler()
    {
        this.Cookies = new CookieContainer();
        this.UserAgent = WebCrawler.DefaultUserAgent;
    }

    #region Web-navigating methods

    /// <summary>
    /// Sends a web request to the specified URI and returns the resulting page contents.
    /// </summary>
    /// <param name="destination">A <see cref="Uri"/> containing the absolute URI of the page to request.</param>
    /// <returns>A <see cref="string"/> containing the contents of the retrieved page.</returns>
    /// <exception cref="ArgumentException">Thrown if <paramref name="destination"/> is not an absolute URI.</exception>
    /// <exception cref="ArgumentNullException">Thrown if <paramref name="destination"/> is <c>null</c>.</exception>
    /// <exception cref="WebException">Thrown if a web-related error was encountered somewhere within the request process.</exception>
    /// <remarks>
    /// <para>This method uses the values of <see cref="UserAgent"/>, <see cref="ProxyHost"/>, and <see cref="ProxyPort"/> when composing the appropriate <see cref="HttpWebRequest"/>.</para>
    /// <para>This method uses and modifies the <see cref="Cookies"/> property to update the cookies that the websites browsed modify, just like a web browser would. It also handles page redirections via recursion.</para>
    /// </remarks>
    /// <seealso cref="UserAgent"/>
    /// <seealso cref="ProxyHost"/>
    /// <seealso cref="ProxyPort"/>
    /// <seealso cref="Cookies"/>
    public string SendRequest(Uri destination)
    {
        return this.SendRequest(destination, null, null);
    }

    /// <summary>
    /// Sends a web request to the specified URI, using the given POST data and referer string, and returns the resulting page contents.
    /// </summary>
    /// <param name="destination">A <see cref="Uri"/> containing the absolute URI of the page to request.</param>
    /// <param name="postData">The POST data to send with this request, or <c>null</c> to send a request without a body.</param>
    /// <param name="referer">A <see cref="Uri"/> containing the absolute URI of the referer page, or <c>null</c> for none.</param>
    /// <returns>A <see cref="string"/> containing the contents of the retrieved page.</returns>
    /// <exception cref="ArgumentException">Thrown if <paramref name="destination"/> or <paramref name="referer"/> are not absolute URIs.</exception>
    /// <exception cref="ArgumentNullException">Thrown if <paramref name="destination"/> is <c>null</c>.</exception>
    /// <exception cref="WebException">Thrown if a web-related error was encountered somewhere within the request process, including a redirect chain that is too long or a redirect without a usable target.</exception>
    /// <remarks>
    /// <para>This method uses the values of <see cref="UserAgent"/>, <see cref="ProxyHost"/>, and <see cref="ProxyPort"/> when composing the appropriate <see cref="HttpWebRequest"/>.</para>
    /// <para>This method uses and modifies the <see cref="Cookies"/> property to update the cookies that the websites browsed modify, just like a web browser would. It also handles page redirections via recursion.</para>
    /// </remarks>
    /// <seealso cref="UserAgent"/>
    /// <seealso cref="ProxyHost"/>
    /// <seealso cref="ProxyPort"/>
    /// <seealso cref="Cookies"/>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1704:IdentifiersShouldBeSpelledCorrectly", MessageId = "referer", Justification = "“Referer” is the correct word when discussing the HTTP protocol.")]
    public string SendRequest(Uri destination, string postData, Uri referer)
    {
        return this.SendRequest(destination, postData, referer, 0);
    }

    // Performs one request and, if the server answers with a redirect, recurses to the new location.
    // repeatsSoFar counts the redirects already followed so that redirect loops terminate.
    private string SendRequest(Uri destination, string postData, Uri referer, int repeatsSoFar)
    {
        if (destination == null)
        {
            throw new ArgumentNullException("destination", "No destination was given for the request.");
        }
        if (!destination.IsAbsoluteUri)
        {
            throw new ArgumentException("The destination must be an absolute URI.", "destination");
        }
        if (referer != null && !referer.IsAbsoluteUri)
        {
            throw new ArgumentException("The referer, if passed, must be an absolute URI.", "referer");
        }

        HttpWebRequest webRequest = this.CreateRequest(destination, referer);
        if (postData != null)
        {
            WritePostData(webRequest, postData);
        }

        Uri redirectTarget = null;
        string redirectPostData = null;
        string responseText;

        // The using block guarantees the response (and its connection) is released even when
        // cookie parsing, body reading, or redirect handling throws.
        using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
        {
            // Note that the HaveResponse property only works _after_ calling GetResponse.
            if (!webRequest.HaveResponse)
            {
                throw new WebException("No response received from host.");
            }

            // Merge any cookies the server set into this crawler's session cookies.
            string setCookieHeader = webResponse.Headers[HttpResponseHeader.SetCookie];
            if (!string.IsNullOrEmpty(setCookieHeader))
            {
                this.Cookies.SetCookies(webResponse.ResponseUri, setCookieHeader);
            }

            // Read the response text.
            using (Stream responseStream = webResponse.GetResponseStream())
            using (StreamReader responseReader = new StreamReader(responseStream))
            {
                responseText = responseReader.ReadToEnd();
            }

            // Handle redirection headers (automatic redirection is disabled on the request so that
            // cookies set by intermediate responses are captured above).
            HttpStatusCode status = webResponse.StatusCode;
            if (IsRedirect(status))
            {
                if (repeatsSoFar >= MaxRedirects)
                {
                    throw new WebException("Too many automatic redirections were attempted.", WebExceptionStatus.ProtocolError);
                }

                // Location may be relative; resolve it against the URI that was just requested.
                string locationHeader = webResponse.Headers["Location"];
                if (string.IsNullOrEmpty(locationHeader) || !Uri.TryCreate(destination, locationHeader, out redirectTarget))
                {
                    throw new WebException("The server returned a redirect without a usable Location header.", WebExceptionStatus.ProtocolError);
                }

                // 301 and 307 repeat the request body at the new location; 302 and 303 are followed
                // with a plain request.
                bool keepBody = status == HttpStatusCode.MovedPermanently || status == HttpStatusCode.TemporaryRedirect;
                redirectPostData = keepBody ? postData : null;
            }
        }

        if (redirectTarget != null)
        {
            // The response has been disposed at this point, so the connection is free for the next hop.
            return this.SendRequest(redirectTarget, redirectPostData, destination, repeatsSoFar + 1);
        }

        // Otherwise just return the response (no more recursion).
        return responseText;
    }

    // Builds the HttpWebRequest for a single hop: headers, cache policy, optional proxy, and the
    // subset of session cookies that apply to the destination.
    private HttpWebRequest CreateRequest(Uri destination, Uri referer)
    {
        HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(destination);
        webRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        webRequest.AllowAutoRedirect = false;
        webRequest.UserAgent = this.UserAgent;
        webRequest.CachePolicy = new System.Net.Cache.RequestCachePolicy(System.Net.Cache.RequestCacheLevel.NoCacheNoStore);
        webRequest.Referer = referer != null ? referer.ToString() : null;

        if (this.proxyHost != null && this.proxyPort >= 0)
        {
            webRequest.Proxy = new WebProxy(this.proxyHost, this.proxyPort);
        }

        // Add cookies for this request.
        webRequest.CookieContainer = new CookieContainer();
        webRequest.CookieContainer.Add(this.Cookies.GetCookies(destination));
        return webRequest;
    }

    // Turns the request into a form-encoded POST and writes the body.
    private static void WritePostData(HttpWebRequest webRequest, string postData)
    {
        // UTF-8 yields the same bytes as ASCII for ASCII input, but does not silently replace
        // other characters with '?'.
        byte[] requestBytes = Encoding.UTF8.GetBytes(postData);
        webRequest.Method = "POST";
        webRequest.ContentType = "application/x-www-form-urlencoded";
        webRequest.ContentLength = requestBytes.Length;
        using (Stream requestStream = webRequest.GetRequestStream())
        {
            requestStream.Write(requestBytes, 0, requestBytes.Length);
        }
    }

    // Returns true for the status codes that this crawler follows as redirects.
    private static bool IsRedirect(HttpStatusCode status)
    {
        switch (status)
        {
            case HttpStatusCode.MovedPermanently:  // 301 (same value as HttpStatusCode.Moved)
            case HttpStatusCode.Found:             // 302 (same value as HttpStatusCode.Redirect)
            case HttpStatusCode.SeeOther:          // 303
            case HttpStatusCode.TemporaryRedirect: // 307
                return true;
            default:
                return false;
        }
    }

    #endregion

    #region Properties

    /// <summary>
    /// Gets or sets the host name of the proxy server to use in crawling the web.
    /// </summary>
    /// <value>A host name, used in setting the <see cref="WebRequest.Proxy"/> property when crawling the web, or <c>null</c> if no proxy is to be used.</value>
    /// <remarks>
    /// The host name property does not include the protocol specification (http://).
    /// Setting a <c>null</c> or empty value clears the host. If an invalid value is given, the host is
    /// cleared before the exception propagates.
    /// </remarks>
    /// <exception cref="UriFormatException">Thrown if the given host name, with "http://" prepended, does not form a valid URI.</exception>
    [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Usage", "CA1806:DoNotIgnoreMethodResults", MessageId = "System.Uri", Justification = "The Uri constructor is called in order to generate UriFormatExceptions when appropriate; its results are not actually needed, but its side effects are.")]
    public string ProxyHost
    {
        get { return this.proxyHost; }
        set
        {
            if (string.IsNullOrEmpty(value))
            {
                this.proxyHost = null;
                return;
            }

            // Will throw a UriFormatException if the format is invalid. Let the caller catch it.
            try
            {
                new Uri("http://" + value);
            }
            catch (UriFormatException)
            {
                this.proxyHost = null;
                throw;
            }

            this.proxyHost = value;
        }
    }

    /// <summary>
    /// Gets or sets the port number of the proxy server to use in crawling the web.
    /// </summary>
    /// <value>A port number, used in setting the <see cref="WebRequest.Proxy"/> property when crawling the web. The initial value is -1, meaning no port has been set.</value>
    /// <remarks>
    /// A proxy is used only when <see cref="ProxyHost"/> is non-<c>null</c> and a valid port has been set.
    /// If an out-of-range value is given, the port is reset to -1 before the exception propagates.
    /// </remarks>
    /// <exception cref="ArgumentOutOfRangeException">Thrown if the given port number is outside the range 0 through 65535, inclusive.</exception>
    public int ProxyPort
    {
        get { return this.proxyPort; }
        set
        {
            if (value < 0 || value > 65535)
            {
                this.proxyPort = -1;
                throw new ArgumentOutOfRangeException("value", "The proxy port must be between 0 and 65535, inclusive.");
            }

            this.proxyPort = value;
        }
    }

    /// <summary>
    /// Gets or sets the user agent used when crawling the web.
    /// </summary>
    /// <value>A user agent string, used in setting the <see cref="HttpWebRequest.UserAgent"/> property when crawling the web. Will by default be set to <see cref="DefaultUserAgent"/>.</value>
    /// <seealso cref="DefaultUserAgent"/>
    public string UserAgent
    {
        get;
        set;
    }

    /// <summary>
    /// A <see cref="CookieContainer"/> that manages all the cookies associated with this <see cref="WebCrawler"/> instance.
    /// </summary>
    /// <value>The cookies managed by this web-crawling session.</value>
    public CookieContainer Cookies
    {
        get;
        private set;
    }

    #endregion

    #region Fields

    /// <summary>
    /// The default user agent used, if the <see cref="UserAgent"/> property is not set by the user.
    /// </summary>
    /// <remarks>Represents a Firefox 3.6.4 browser on a Windows 7 system.</remarks>
    /// <seealso cref="UserAgent"/>
    public const string DefaultUserAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.4) Gecko/20100611 Firefox/3.6.4";

    // Upper bound on the number of redirects followed for one SendRequest call; matches the default
    // of HttpWebRequest.MaximumAutomaticRedirections.
    private const int MaxRedirects = 50;

    private string proxyHost;

    // -1 means "no port configured"; SendRequest only uses a proxy when this is >= 0.
    private int proxyPort = -1;

    #endregion
}
}