23 April 2007

Crawling your competitor's web site

Have a quick look inside your competitor's web site.
Retrieve all the content from the head tag and analyze it.


The ASP.NET code:


<%@ Page Language="VB" AutoEventWireup="false" ValidateRequest="false" %>

<%@ Import Namespace="System.Net" %>
<%@ Import Namespace="System.IO" %>

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<script runat="server">
' Click handler for the "Crawl site" button.
'
' Downloads the document at the URL typed into UrlTextBox, puts the complete
' response text into allPageContentTextBox and, when a head element is found,
' puts that element (opening tag through closing tag) into
' expectedResultTextBox. The two indexes used for the extraction are reported
' in RequestStatusLabel; the message of any exception is reported in ErrorLabel.
'
' sender - the control that raised the event (not used).
' e      - event data (not used).
Protected Sub CrawlWebSiteButton_Click(ByVal sender As Object, _
        ByVal e As System.EventArgs) Handles CrawlWebSiteButton.Click

    Const HeadOpenTag As String = "<head"
    Const HeadCloseTag As String = "</head>"

    ' Clear the output of any earlier request so a stale result or error is
    ' never shown next to the new one.
    Me.ErrorLabel.Text = String.Empty
    Me.RequestStatusLabel.Text = String.Empty
    Me.expectedResultTextBox.Text = String.Empty
    Me.allPageContentTextBox.Text = String.Empty

    ' The validators on UrlTextBox are also evaluated on the server; do not
    ' issue a request for input they rejected (e.g. client script disabled).
    If Not Me.IsValid Then
        Return
    End If

    Dim targetUrl As String = Me.UrlTextBox.Text

    Try
        Dim myRequest As System.Net.WebRequest = WebRequest.Create(targetUrl)

        ' NOTE(review): this offers the application's default credentials to
        ' whatever host the user typed in. Kept to preserve existing behaviour,
        ' but confirm it is really wanted for arbitrary external URLs.
        myRequest.Credentials = CredentialCache.DefaultCredentials

        ' The request's default proxy is used as-is. The original code built a
        ' WebProxy("MyProxy", 80) and immediately overwrote the variable without
        ' ever assigning it to the request, so it had no effect and was removed.

        Dim pageContent As String

        ' Using guarantees the response and the reader are released even when
        ' reading fails part-way.
        Using response As System.Net.WebResponse = myRequest.GetResponse()
            Using sr As New StreamReader(response.GetResponseStream())
                pageContent = sr.ReadToEnd()
            End Using
        End Using

        Me.allPageContentTextBox.Text = pageContent

        ' Tag names are case-insensitive in HTML, so compare ignoring case.
        Dim startIndex As Integer = pageContent.IndexOf( _
            HeadOpenTag, System.StringComparison.OrdinalIgnoreCase)

        ' Look for the first closing tag that follows the opening tag; a later
        ' "</head>" elsewhere in the document must not extend the match.
        Dim endIndex As Integer = -1
        If startIndex >= 0 Then
            endIndex = pageContent.IndexOf( _
                HeadCloseTag, startIndex, System.StringComparison.OrdinalIgnoreCase)
        End If

        Me.RequestStatusLabel.Text = "Request status: " _
            + "Start index: " + startIndex.ToString() _
            + " " _
            + "End index: " + endIndex.ToString()

        ' Index 0 is a valid position for the opening tag, hence ">= 0".
        If startIndex >= 0 AndAlso endIndex > startIndex Then
            ' Include the closing tag itself in the extracted text.
            Me.expectedResultTextBox.Text = pageContent.Substring( _
                startIndex, endIndex - startIndex + HeadCloseTag.Length)
        End If

    Catch ex As Exception
        ' Deliberate catch-all: this is a diagnostic page, so any failure
        ' (bad URL, network error, ...) is reported to the user instead of
        ' surfacing as an unhandled error page.
        Me.ErrorLabel.Text = "Error: " + ex.Message
    End Try
End Sub
</script>

<%--
Page layout for the head-tag reader. All server controls live in form1 and are
read or written by CrawlWebSiteButton_Click in the server script of this page.

NOTE(review): several attribute values below (Font-Names, ToolTip and the
ValidationExpression regular expression) are split across two lines, which
makes the line break part of the value. This looks like a line-wrapping
artifact of publishing the code; confirm and rejoin the lines before use.
--%>
<html xmlns="http://www.w3.org/1999/xhtml">
<head id="Head1" runat="server">
<title>Header Tag Reader</title>
</head>
<body>
<form id="form1" runat="server">
<%--
Input row: the URL text box, the button that starts the crawl, and two
validators that both target UrlTextBox (URL pattern check and required check).
The button declares no OnClick attribute; the handler in the server script is
attached through its "Handles CrawlWebSiteButton.Click" clause.
--%>
<div>
<asp:Label ID="Label1" runat="server" Text="Input url:" Font-Names="Georgia,
Serif"
Font-Size="12pt" ForeColor="#800000"></asp:Label><br />
<asp:TextBox ID="UrlTextBox" runat="server" Font-Names="Verdana, Sans-Serif"
Font-Size="9pt"
ForeColor="#434041" Width="291px"></asp:TextBox>&nbsp;&nbsp;<asp:Button
ID="CrawlWebSiteButton" runat="server" Text="Crawl site" />
<asp:RegularExpressionValidator ID="RegularExpressionValidator1"
ControlToValidate="UrlTextBox"
runat="server" ToolTip="Input a valid URL !"
ValidationExpression="http(s)?://([\w-]+\.)+[\w-]+(/[\w-
./?%&=]*)?">*</asp:RegularExpressionValidator>
<asp:RequiredFieldValidator ID="RequiredFieldValidator1" runat="server"
ControlToValidate="UrlTextBox"
ErrorMessage="This field is required!" ToolTip="This field is
required!">*</asp:RequiredFieldValidator>
</div>
<%--
Output area: expectedResultTextBox is filled by the click handler with the
extracted head element, allPageContentTextBox with the complete response text.
--%>
<div>
<br />
<br />
<asp:Label ID="Label2" runat="server" Font-Names="Georgia, Serif"
Font-Italic="true"
Font-Size="12pt" ForeColor="#800000" Text="Head content"></asp:Label>
<br />
<asp:TextBox ID="expectedResultTextBox" runat="server" Height="269px"
TextMode="MultiLine"
Font-Names="Verdana, Sans-Serif" Font-Size="9pt" Width="724px"></asp:TextBox>
<br />
<br />
<asp:Label ID="Label3" runat="server" Font-Italic="true" Font-Names="Georgia,
Serif"
Font-Size="12pt" ForeColor="#800000" Text="Page content"></asp:Label>
<br />
<asp:TextBox ID="allPageContentTextBox" runat="server" Height="269px"
TextMode="MultiLine"
Font-Names="Verdana, Sans-Serif" Font-Size="9pt" Width="724px"></asp:TextBox>
</div>
<%--
Status area: RequestStatusLabel receives the start/end indexes computed by the
click handler; ErrorLabel receives the message of any exception it catches.
--%>
<div>
<br />
<asp:Label ID="RequestStatusLabel" runat="server" Font-Names="Verdana,
Sans-Serif" Font-Size="10pt"
ForeColor="#800000"></asp:Label>
<br />
<br />
<asp:Label ID="ErrorLabel" ForeColor="#FF0000" runat="server"
Font-Names="Verdana, Sans-Serif"
Font-Size="10pt"></asp:Label>
</div>
</form>
</body>
</html>

No comments: