using System;
using System.IO;
using System.Net;
using System.Text;

namespace DomenicDenicola
{
    /// <summary>
    /// Provides methods for performing web requests.
    /// </summary>
    /// <remarks>
    /// SECURITY NOTE: the static constructor of this class registers a certificate validation callback
    /// that accepts every server certificate, for the whole process. Do not use this class where the
    /// authenticity of HTTPS servers matters.
    /// </remarks>
    public class WebCrawler
    {
        [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Performance", "CA1810:InitializeReferenceTypeStaticFieldsInline", Justification = "This static constructor performs an operation that must take place before any use of WebCrawler, and is not simply field initialization.")]
        static WebCrawler()
        {
            // This makes it unconditionally accept all server certifications.
            // NOTE(review): the callback is registered on the process-wide ServicePointManager, so it
            // disables certificate validation for every request in the process, not only for WebCrawler.
            // Left as-is because existing callers may rely on it; consider making it opt-in.
            ServicePointManager.ServerCertificateValidationCallback += (sender, cert, chain, policyErrors) => true;
        }

        /// <summary>
        /// Initializes a new <see cref="WebCrawler"/> with an empty cookie container and the default user agent.
        /// </summary>
        public WebCrawler()
        {
            this.Cookies = new CookieContainer();
            this.UserAgent = WebCrawler.DefaultUserAgent;
        }

        #region Web-navigating methods

        /// <summary>
        /// Sends a web request to the specified URI and returns the resulting page contents.
        /// </summary>
        /// <param name="destination">A <see cref="Uri"/> containing the absolute URI of the page to request.</param>
        /// <returns>A <see cref="string"/> containing the contents of the retrieved page.</returns>
        /// <exception cref="ArgumentException">Thrown if <paramref name="destination"/> is not an absolute URI.</exception>
        /// <exception cref="ArgumentNullException">Thrown if <paramref name="destination"/> is <see langword="null"/>.</exception>
        /// <exception cref="WebException">Thrown if a web-related error was encountered somewhere within the request process, including following too many redirections.</exception>
        /// <remarks>
        /// <para>This method uses the values of <see cref="ProxyHost"/>, <see cref="ProxyPort"/>, and <see cref="UserAgent"/> when composing the appropriate <see cref="HttpWebRequest"/>.</para>
        /// <para>This method uses and modifies the <see cref="Cookies"/> property to update the cookies that the websites browsed modify, just like a web browser would. It also handles page redirections via recursion.</para>
        /// </remarks>
        /// <seealso cref="ProxyHost"/>
        /// <seealso cref="ProxyPort"/>
        /// <seealso cref="UserAgent"/>
        /// <seealso cref="Cookies"/>
        public string SendRequest(Uri destination)
        {
            return this.SendRequest(destination, null, null);
        }

        /// <summary>
        /// Sends a web request to the specified URI, using the given POST data and referer string, and returns the resulting page contents.
        /// </summary>
        /// <param name="destination">A <see cref="Uri"/> containing the absolute URI of the page to request.</param>
        /// <param name="postData">The POST data to send with this request, or <see langword="null"/> to send the request without a body.</param>
        /// <param name="referer">A <see cref="Uri"/> containing the absolute URI of the referer page, or <see langword="null"/> for no referer.</param>
        /// <returns>A <see cref="string"/> containing the contents of the retrieved page.</returns>
        /// <exception cref="ArgumentException">Thrown if <paramref name="destination"/> or <paramref name="referer"/> are not absolute URIs.</exception>
        /// <exception cref="ArgumentNullException">Thrown if <paramref name="destination"/> is <see langword="null"/>.</exception>
        /// <exception cref="WebException">Thrown if a web-related error was encountered somewhere within the request process, including following too many redirections.</exception>
        /// <remarks>
        /// <para>This method uses the values of <see cref="ProxyHost"/>, <see cref="ProxyPort"/>, and <see cref="UserAgent"/> when composing the appropriate <see cref="HttpWebRequest"/>.</para>
        /// <para>This method uses and modifies the <see cref="Cookies"/> property to update the cookies that the websites browsed modify, just like a web browser would. It also handles page redirections via recursion.</para>
        /// </remarks>
        /// <seealso cref="ProxyHost"/>
        /// <seealso cref="ProxyPort"/>
        /// <seealso cref="UserAgent"/>
        /// <seealso cref="Cookies"/>
        [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Naming", "CA1704:IdentifiersShouldBeSpelledCorrectly", MessageId = "referer", Justification = "“Referer” is the correct word when discussing the HTTP protocol.")]
        public string SendRequest(Uri destination, string postData, Uri referer)
        {
            return this.SendRequest(destination, postData, referer, 0);
        }

        /// <summary>
        /// Performs a single request and, if the server answers with a 301 or 302 redirection, follows it recursively.
        /// </summary>
        /// <param name="destination">The absolute URI to request.</param>
        /// <param name="postData">The POST data to send, or <see langword="null"/> to send the request without a body.</param>
        /// <param name="referer">The absolute URI of the referer page, or <see langword="null"/>.</param>
        /// <param name="repeatsSoFar">The number of redirections already followed to reach <paramref name="destination"/>.</param>
        /// <returns>The body text of the final (non-redirecting) response.</returns>
        private string SendRequest(Uri destination, string postData, Uri referer, int repeatsSoFar)
        {
            if (destination == null)
            {
                throw new ArgumentNullException("destination", "No destination was given for the request.");
            }
            if (!destination.IsAbsoluteUri)
            {
                throw new ArgumentException("The destination must be an absolute URI.", "destination");
            }
            if (referer != null && !referer.IsAbsoluteUri)
            {
                throw new ArgumentException("The referer, if passed, must be an absolute URI.", "referer");
            }

            HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(destination);
            webRequest.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
            // Redirections are followed manually below so that cookies set by intermediate responses are captured.
            webRequest.AllowAutoRedirect = false;
            webRequest.UserAgent = this.UserAgent;
            webRequest.CookieContainer = new CookieContainer();
            webRequest.CachePolicy = new System.Net.Cache.RequestCachePolicy(System.Net.Cache.RequestCacheLevel.NoCacheNoStore);
            webRequest.Referer = referer != null ? referer.ToString() : null;
            if (this.proxyHost != null && this.proxyPort >= 0)
            {
                webRequest.Proxy = new WebProxy(this.proxyHost, this.proxyPort);
            }

            // Add cookies for this request
            webRequest.CookieContainer.Add(this.Cookies.GetCookies(destination));

            // Write the request
            if (postData != null)
            {
                byte[] requestBytes = Encoding.ASCII.GetBytes(postData);

                webRequest.Method = "POST";
                webRequest.ContentType = "application/x-www-form-urlencoded";
                webRequest.ContentLength = requestBytes.Length;

                using (Stream reqStream = webRequest.GetRequestStream())
                {
                    reqStream.Write(requestBytes, 0, requestBytes.Length);
                }
            }

            string responseText;
            Uri redirectTarget = null;
            string redirectPostData = null;

            // The using block guarantees the response (and its connection) is released even if reading
            // the body or storing the cookies throws.
            using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
            {
                // Note that the HaveResponse property only works _after_ calling GetResponse.
                if (!webRequest.HaveResponse)
                {
                    throw new WebException("No response received from host.");
                }

                // Handle returned cookies
                string setCookieHeader = webResponse.Headers[HttpResponseHeader.SetCookie];
                if (!string.IsNullOrEmpty(setCookieHeader))
                {
                    this.Cookies.SetCookies(webResponse.ResponseUri, setCookieHeader);
                }

                // Read the response text.
                using (Stream responseStream = webResponse.GetResponseStream())
                {
                    using (StreamReader responseReader = new StreamReader(responseStream))
                    {
                        responseText = responseReader.ReadToEnd();
                    }
                }

                // Handle redirection headers. HttpStatusCode.Moved and HttpStatusCode.Redirect are aliases
                // of MovedPermanently (301) and Found (302), so two comparisons cover all four names.
                HttpStatusCode status = webResponse.StatusCode;
                if (status == HttpStatusCode.MovedPermanently || status == HttpStatusCode.Found)
                {
                    string locationHeader = webResponse.Headers[HttpResponseHeader.Location];

                    // A redirection status without a Location header cannot be followed; treat the
                    // response as final and return its body.
                    if (!string.IsNullOrEmpty(locationHeader))
                    {
                        redirectTarget = new Uri(destination, new Uri(locationHeader, UriKind.RelativeOrAbsolute));

                        // The POST data is re-sent only for a 301; a 302 is followed without a body.
                        redirectPostData = status == HttpStatusCode.MovedPermanently ? postData : null;
                    }
                }
            }

            // No redirection: just return the response (no more recursion).
            if (redirectTarget == null)
            {
                return responseText;
            }

            // Guard against redirection loops, which would otherwise recurse until the stack overflows.
            if (repeatsSoFar >= WebCrawler.MaximumRedirections)
            {
                throw new WebException("Too many redirections were attempted.", WebExceptionStatus.ProtocolError);
            }

            // Recurse only after the current response has been disposed, so at most one response is open at a time.
            return this.SendRequest(redirectTarget, redirectPostData, destination, repeatsSoFar + 1);
        }

        #endregion

        #region Properties

        /// <summary>
        /// Gets or sets the host name of the proxy server to use in crawling the web.
        /// </summary>
        /// <value>A host name, used in setting the <see cref="HttpWebRequest.Proxy"/> property when crawling the web, or <see langword="null"/> if no proxy is to be used.</value>
        /// <remarks>
        /// The host name property does not include the protocol specification (http://).
        /// Setting a <see langword="null"/> or empty value clears the proxy host. If validation fails, the proxy host is reset to <see langword="null"/> before the exception propagates.
        /// </remarks>
        /// <exception cref="UriFormatException">Thrown if the given host name, with "http://" prepended, does not form a valid URI.</exception>
        [System.Diagnostics.CodeAnalysis.SuppressMessage("Microsoft.Usage", "CA1806:DoNotIgnoreMethodResults", MessageId = "System.Uri", Justification = "The Uri constructor is called in order to generate UriFormatExceptions when appropriate; its results are not actually needed, but its side effects are.")]
        public string ProxyHost
        {
            get { return this.proxyHost; }
            set
            {
                if (string.IsNullOrEmpty(value))
                {
                    this.proxyHost = null;
                    return;
                }

                // Will throw a UriFormatException if the format is invalid. Let the caller catch it.
                try
                {
                    new Uri("http://" + value);
                }
                catch (UriFormatException)
                {
                    this.proxyHost = null;
                    throw;
                }

                this.proxyHost = value;
            }
        }

        /// <summary>
        /// Gets or sets the port number of the proxy server to use in crawling the web.
        /// </summary>
        /// <value>A port number, used in setting the <see cref="HttpWebRequest.Proxy"/> property when crawling the web, or -1 if no port has been set.</value>
        /// <remarks>
        /// A proxy is used only when <see cref="ProxyHost"/> is non-<see langword="null"/> and this property is non-negative.
        /// If an out-of-range value is assigned, the port is reset to -1 before the exception is thrown.
        /// </remarks>
        /// <exception cref="ArgumentOutOfRangeException">Thrown if the given port number is outside the range 0 through 65535, inclusive.</exception>
        public int ProxyPort
        {
            get { return this.proxyPort; }
            set
            {
                if (value < 0 || value > 65535)
                {
                    this.proxyPort = -1;
                    throw new ArgumentOutOfRangeException("value", "The proxy port must be between 0 and 65535, inclusive.");
                }

                this.proxyPort = value;
            }
        }

        /// <summary>
        /// Gets or sets the user agent used when crawling the web.
        /// </summary>
        /// <value>A user agent string, used in setting the <see cref="HttpWebRequest.UserAgent"/> property when crawling the web. Will by default be set to <see cref="DefaultUserAgent"/>.</value>
        /// <seealso cref="DefaultUserAgent"/>
        public string UserAgent { get; set; }

        /// <summary>
        /// A <see cref="CookieContainer"/> that manages all the cookies associated with this <see cref="WebCrawler"/> instance.
        /// </summary>
        /// <value>The cookies managed by this web-crawling session.</value>
        public CookieContainer Cookies { get; private set; }

        #endregion

        #region Fields

        /// <summary>
        /// The default user agent used, if the <see cref="UserAgent"/> property is not set by the user.
        /// </summary>
        /// <remarks>Represents a Firefox 3.6.4 browser on a Windows 7 system.</remarks>
        /// <seealso cref="UserAgent"/>
        public const string DefaultUserAgent = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.4) Gecko/20100611 Firefox/3.6.4";

        // Upper bound on the number of redirections followed for one request.
        private const int MaximumRedirections = 50;

        private string proxyHost;

        // -1 means "no port set"; a proxy is only used once this is non-negative.
        private int proxyPort = -1;

        #endregion
    }
}