Easily retrieve HTML content from websites using C# and HTML Agility Pack (HtmlAgilityPack)

I needed to retrieve a chunk of content (a product price) from a certain DIV (<div class="price">) on a website for a large number of search terms. The quickest way of doing this was to use the HtmlAgilityPack library for .NET to call the website’s search page (with custom search terms) and then pull out the content I needed using the SelectNodes() method. SelectNodes() uses XPath to search across the document, which sounds tricky but is easy once you know some basic rules. XPath is described well on w3schools if you need to know the syntax. You will need to download the Html Agility Pack DLL and add a reference to it in your project.

The following is a simplified version of my code with a file reader/writer to read in a text file containing search terms line by line and output the result to another text file. The exact code will change depending on the website you are using, especially the XPATH in SelectNodes():

using System;
using System.IO;
using HtmlAgilityPack;

namespace webget
{
  /// <summary>
  /// Reads search terms from input.csv (one per line), fetches the search
  /// page for each term and writes "term,price1,price2,..." to output.csv.
  /// A term whose page cannot be loaded, or has no price DIV, is written
  /// as "term,ERROR".
  /// </summary>
  class Program
  {
    static void Main(string[] args)
    {
      // Create the Html Agility Pack object
      HtmlWeb hw = new HtmlWeb();

      // using-blocks make sure both files are flushed and closed even if
      // an unexpected exception escapes the loop.
      using (StreamReader infile = new StreamReader("input.csv"))
      using (StreamWriter outfile = new StreamWriter("output.csv"))
      {
        string term;
        while ((term = infile.ReadLine()) != null)
        {
          term = term.Trim();
          string line = term;

          try
          {
            // Load the website and store it in an HtmlDocument object.
            // The term is escaped so spaces, '&', '#', etc. cannot break the query string.
            HtmlDocument doc = hw.Load("http://www.website.com/search.php?keywords=" + Uri.EscapeDataString(term));

            // SelectNodes() returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection priceNodes = doc.DocumentNode.SelectNodes("//div[@class='price']");
            if (priceNodes == null)
            {
              line += ",ERROR";
            }
            else
            {
              // Loop through every DIV with class "price" and extract the content
              foreach (HtmlNode priceNode in priceNodes)
              {
                // Clean up the content by removing the £ symbol and any
                // carriage returns, tabs and line feeds.
                string price = priceNode.InnerText
                  .Replace("£", "")
                  .Replace("\r", "")
                  .Replace("\t", "")
                  .Replace("\n", "")
                  .Trim();
                line += "," + price;
              }
            }
          }
          catch (Exception ex)
          {
            // Keep going with the next term, but say why this one failed.
            Console.Error.WriteLine(term + ": " + ex.Message);
            line += ",ERROR";
          }

          Console.WriteLine(line);
          outfile.WriteLine(line);
        }
      }
    }
  }
}

Connecting to SQL Server from C++

Despite being quite a basic requirement, it was very difficult to find anything solid on connecting to a SQL database without using CLR or 3rd party libraries in visual C++. This code was tested and works when connecting to a SQL Server Express 2005 server in Visual Studio 2010 with a win32 console project using visual C++. Thanks to Tidy Tutorials, who look to have scalped it from MSDN (although I couldn’t find it anywhere).

#include <iostream>
#include <windows.h>
#include <sqltypes.h>
#include <sql.h>
#include <sqlext.h>

using namespace std;

void show_error(unsigned int handletype, const SQLHANDLE& handle){
    SQLCHAR sqlstate[1024];
    SQLCHAR message[1024];
    if(SQL_SUCCESS == SQLGetDiagRec(handletype, handle, 1, sqlstate, NULL, message, 1024, NULL))
        cout<<"Message: "<<message<<"nSQLSTATE: "<<sqlstate<<endl;
}

int main(){

    SQLHANDLE sqlenvhandle;    
    SQLHANDLE sqlconnectionhandle;
    SQLHANDLE sqlstatementhandle;
    SQLRETURN retcode;

    if(SQL_SUCCESS!=SQLAllocHandle(SQL_HANDLE_ENV, SQL_NULL_HANDLE, &sqlenvhandle))
        goto FINISHED;

    if(SQL_SUCCESS!=SQLSetEnvAttr(sqlenvhandle,SQL_ATTR_ODBC_VERSION, (SQLPOINTER)SQL_OV_ODBC3, 0)) 
        goto FINISHED;
    
    if(SQL_SUCCESS!=SQLAllocHandle(SQL_HANDLE_DBC, sqlenvhandle, &sqlconnectionhandle))
        goto FINISHED;

    SQLCHAR retconstring[1024];
    switch(SQLDriverConnect (sqlconnectionhandle, 
                NULL, 
                (SQLCHAR*)"DRIVER={SQL Server};SERVER=localhost, 1433;DATABASE=MyDatabase;UID=sa;PWD=Admin-123;", 
                SQL_NTS, 
                retconstring, 
                1024, 
                NULL,
                SQL_DRIVER_NOPROMPT)){
        case SQL_SUCCESS_WITH_INFO:
            show_error(SQL_HANDLE_DBC, sqlconnectionhandle);
            break;
        case SQL_INVALID_HANDLE:
        case SQL_ERROR:
            show_error(SQL_HANDLE_DBC, sqlconnectionhandle);
            goto FINISHED;
        default:
            break;
    }
    
    if(SQL_SUCCESS!=SQLAllocHandle(SQL_HANDLE_STMT, sqlconnectionhandle, &sqlstatementhandle))
        goto FINISHED;

    if(SQL_SUCCESS!=SQLExecDirect(sqlstatementhandle, (SQLCHAR*)"select * from testtable", SQL_NTS)){
        show_error(SQL_HANDLE_STMT, sqlstatementhandle);
        goto FINISHED;
    }
    else{
        char name[64];
        char address[64];
        int id;
        while(SQLFetch(sqlstatementhandle)==SQL_SUCCESS){
            SQLGetData(sqlstatementhandle, 1, SQL_C_ULONG, &id, 0, NULL);
            SQLGetData(sqlstatementhandle, 2, SQL_C_CHAR, name, 64, NULL);
            SQLGetData(sqlstatementhandle, 3, SQL_C_CHAR, address, 64, NULL);
            cout<<id<<" "<<name<<" "<<address<<endl;
        }
    }

FINISHED:
    SQLFreeHandle(SQL_HANDLE_STMT, sqlstatementhandle );
    SQLDisconnect(sqlconnectionhandle);
    SQLFreeHandle(SQL_HANDLE_DBC, sqlconnectionhandle);
    SQLFreeHandle(SQL_HANDLE_ENV, sqlenvhandle);
    
}

Run System Commands in The Background With c#

I needed to run 200-odd batch files sequentially in the background for a speech classifier. The code to do this involved writing a batch file in code containing the commands I wanted the HTK to run. This is probably not the best way of doing it, but HTK is funny about file paths with spaces etc., and this was the quickest way.

// Run temp.bat with no visible window and block until it has finished.
// The using-block releases the process handle afterwards.
using (Process hvite = new Process())
{
    hvite.EnableRaisingEvents = false;
    hvite.StartInfo.FileName = "temp.bat";
    hvite.StartInfo.CreateNoWindow = true;
    hvite.StartInfo.WindowStyle = ProcessWindowStyle.Hidden;
    hvite.Start();
    hvite.WaitForExit();
}

Converting CSV and Vector Data to Native HTK Format Using C#

The output of my Principal Component Analysis in Matlab, which reduces the dimensionality of gesture data, is in comma-separated values (CSV) format. 57-dimension data goes in, X-dimension data comes out in standard CSV format. It is better to remove unnecessary information from the gesture data as it only makes the recognition of gesture (and intent, in my case) more difficult.

The problem is that the HTK, which I am going to use to perform recognition, doesn’t natively accept csv data so you have to convert to the HTK binary format parameter files. I chose to do this in c# as I’m familiar with it, but I stumbled across a few problems relating to the conversion between big-endian and little-endian binary data. HTK reads data in the opposite way to my PC (although I’m sure I read on their website somewhere that there is automatic detection for this).

The following code is pretty rough around the edges as it includes a lot of stuff to help me debug it. The program reads a directory and converts all *.csv files into HTK format binary files by reading in the data as floats, converting to bytes, writing a header and then writing the data to a binary file *.csv.bin.

/// <summary>
/// Converts every *.csv file in a directory into an HTK binary parameter
/// file named "&lt;original name&gt;.bin". Each CSV line is one sample of
/// comma-separated floats; the output is a 12-byte header followed by the
/// samples as 4-byte floats, all written big-endian.
/// </summary>
/// <param name="args">Optional: args[0] is the directory to convert.</param>
static void Main(string[] args)
{
    // NOTE(review): the path separators of the default directory were lost in
    // the published listing; this literal is a reconstruction - confirm it,
    // or pass the directory as the first command-line argument.
    string dir = args.Length > 0 ? args[0] : @"G:\PHD Nov 09\c# programs\matlab work\test";
    DirectoryInfo di = new DirectoryInfo(dir);

    foreach (FileInfo fi in di.GetFiles("*.csv"))
    {
        string data;
        using (TextReader tr = new StreamReader(fi.FullName))
        {
            data = tr.ReadToEnd();
        }

        // One sample per line. Spaces are accepted as sample separators too,
        // and empty entries (CRLF line ends, blank lines) are dropped so they
        // are not counted as samples.
        string[] rows = data.Split(new char[] { '\r', '\n', ' ' }, StringSplitOptions.RemoveEmptyEntries);
        if (rows.Length == 0)
        {
            continue; // nothing to convert in this file
        }

        int samples = rows.Length;
        int itemspersample = rows[0].Split(',').Length;

        // Each value becomes a 4-byte float; all samples are joined into one buffer.
        byte[] bytedata = new byte[samples * itemspersample * 4];

        for (int i = 0; i < samples; i++)
        {
            string[] fields = rows[i].Split(',');
            if (fields.Length < itemspersample)
            {
                throw new InvalidDataException(fi.FullName + ": sample " + i + " has " + fields.Length
                    + " values, expected " + itemspersample);
            }

            for (int j = 0; j < itemspersample; j++)
            {
                // Invariant culture: the file uses '.' as the decimal point
                // regardless of the regional settings of this machine.
                float f = (float)Convert.ToDouble(fields[j], System.Globalization.CultureInfo.InvariantCulture);
                byte[] valueBytes = ToBigEndian(BitConverter.GetBytes(f));
                Array.Copy(valueBytes, 0, bytedata, (i * itemspersample * 4) + (j * 4), 4);
            }
        }

        // HTK header, 12 bytes: sample count (4), sample period (4),
        // bytes per sample (2), parameter kind (2).
        byte[] nSamples = ToBigEndian(BitConverter.GetBytes(samples));
        byte[] sampPeriod = ToBigEndian(BitConverter.GetBytes(100000)); // HList reports this as 10000.0 us
        byte[] sampSize = ToBigEndian(BitConverter.GetBytes(Convert.ToInt16(itemspersample * 4)));
        byte[] parmKind = ToBigEndian(BitConverter.GetBytes(Convert.ToInt16(9))); // HList reports kind 9 as USER

        using (BinaryWriter bw = new BinaryWriter(File.Open(fi.FullName + ".bin", FileMode.Create)))
        {
            bw.Write(nSamples);
            bw.Write(sampPeriod);
            bw.Write(sampSize);
            bw.Write(parmKind);
            bw.Write(bytedata);
        }
    }
}

// Returns the bytes in big-endian order, reversing them in place only when
// BitConverter produced little-endian output on this machine.
private static byte[] ToBigEndian(byte[] bytes)
{
    if (BitConverter.IsLittleEndian)
    {
        Array.Reverse(bytes);
    }
    return bytes;
}

To check it works you run HList, with no config file required as the header explains to HTK everything it needs to know about the data:

G:PHD Nov 09# programsmatlab worktest>hlist -h EO412_10PCs.csv.bin
————————- Source: EO412_10PCs.csv.bin ————————
Sample Bytes:  40       Sample Kind:   USER
Num Comps:     10       Sample Period: 10000.0 us
Num Samples:   5        File Format:   HTK
—————————— Samples: 0->-1 ——————————
0:    1838.200 308.910-262.970 401.920 -66.737-499.370 305.260-260.250 -91.974  28.171
1:    1837.700 308.630-263.340 400.810 -67.144-499.920 305.280-260.060 -92.174  27.584
2:    1837.000 308.360-263.750 399.940 -67.870-500.510 305.540-259.960 -91.964  26.922
3:    1836.500 308.160-264.000 398.500 -68.003-501.230 305.790-259.980 -92.138  26.342
4:    1837.000 308.360-263.750 399.940 -67.870-500.510 305.540-259.960 -91.964  26.922
———————————– END ———————————–

Adding SMS Functionality to .net Using a Nokia 6310 & SMSLib

I used to use a Java library for receiving SMS to my phone, which I then wrote to a text file to be picked up periodically by other programs. Obviously this is pretty terrible in practice due to file locks and other filesystem overheads. I was going to rewrite it to use a database but never got round to it until I needed to dig out the functionality again.

The project has moved on since then and I was determined to integrate SMS properly into my C# solution. First I tried using the built in serial port libraries of System.IO.Ports added in C# 2.0 and AT commands to talk to my Nokia 6310.

We decided on the 6310 as it is a reliable cheap phone with a GSM modem, required if you want to use AT commands to talk to it. The Java app I wrote originally used AT commands to read a list of messages at periodic intervals. Developers Home provided a great source of info on the required AT commands to read off all unread SMS messages which I then parsed elsewhere.

When I rewrote the Java app for C# I tried to use all the same AT commands but every time I tried I just got garbled text returned, even when I did everything completely in serial and added pauses for the com port to catch up (serialin is my com port instance):

// Send each AT command in turn, pause so the phone can answer, then print
// whatever has arrived on the port.
// NOTE(review): WriteLine() already appends serialIn.NewLine, so the three
// WriteLine calls below send the line terminator twice - confirm whether
// that is intended.

// Basic "are you there" check.
serialIn.Write("AT" + System.Environment.NewLine);
Thread.Sleep(500);
Console.WriteLine(serialIn.ReadExisting());
Thread.Sleep(500);

serialIn.WriteLine("AT+CNMI=2,1,0,0,0" + System.Environment.NewLine);
Thread.Sleep(500);
Console.WriteLine(serialIn.ReadExisting());
Thread.Sleep(500);

serialIn.WriteLine("AT+CMGF=1" + System.Environment.NewLine);
Thread.Sleep(500);
Console.WriteLine(serialIn.ReadExisting());
Thread.Sleep(500);

// The message-list command takes a quoted argument, so the inner quotes are escaped.
serialIn.WriteLine("AT+CMGL=\"all\"" + System.Environment.NewLine);
Thread.Sleep(3000); // longer pause than for the other commands
Console.WriteLine(serialIn.ReadExisting());

For reference my serial port was set up to work with my 6310 using a DLR-3P cable I picked up off ebay for £5. The initialization of the port was as follows:

// Configure the port used to talk to the phone: COM1 at 2400 baud,
// 8 data bits, no parity, 1 stop bit, RTS hardware handshaking.
serialIn = new SerialPort();
serialIn.PortName = "COM1";
serialIn.BaudRate = 2400;
serialIn.DataBits = 8;
serialIn.Parity = Parity.None;
serialIn.StopBits = StopBits.One;
serialIn.Handshake = Handshake.RequestToSend;
serialIn.DtrEnable = true;
serialIn.RtsEnable = true;
// ReadLine/WriteLine on this port use the platform line terminator.
serialIn.NewLine = System.Environment.NewLine;

I was confused as to why I was getting back garbled text from the serial port as I had tried these exact same commands in HyperTerminal without any problems. I played around with timings and pauses but whatever I did it still didn’t give me the correct SMS message readouts I expected.

Next I tried to find a working tutorial but nothing I did could get the port to send back the correct info as they all used a similar method of sending AT commands. I eventually gave up and found an open source library called SMSlib. This library is written for Java currently but they used to have a C# version, which I had to dig up at a Google Group dedicated to the project and you can download from here.

Although the C# version is deprecated, it works great and with a little bit of threading I integrated SMS receiving into my existing solution. I added the SMSlib project, made a reference to the project in my main project, added the “SMSLib.Service” object and followed the “ReadMessages” example included.

Updating GUI from Other Threads in C#

I’ve been meaning to write this down for a while. There is a really easy way of updating controls and generally updating the GUI of your main thread from another thread using BeginInvoke.

Create a global delegate for your Form class eg:

/// <summary>
/// Delegate type for a method that takes a single string and returns nothing.
/// </summary>
/// <param name="input">The string handed to the target method.</param>
public delegate void InvokeDelegate(string input);

This means the delegate takes in a single argument (a string in this case). Next create a method to call to update the controls or do anything really. My method just appends text to a textbox:

/// <summary>
/// Appends the given text to the end of the textboxLog control.
/// </summary>
/// <param name="input">Text to add to the log.</param>
public void UpdateLog(string input)
{
    textboxLog.AppendText(input);
}

Now when you want to call this method from threads other than the main thread you create an object array, put in your input variables and use BeginInvoke to call your delegate which in turn calls your method:

// Arguments for the delegate travel in an object array, one element per parameter.
object[] obj = new object[1];
obj[0] = "text to append to my log textbox";
// Queue UpdateLog through the control's BeginInvoke instead of calling it directly.
textboxLog.BeginInvoke(new InvokeDelegate(UpdateLog), obj);

Speech Recognition Using Microsoft SAPI

Although this may be a bit out of date considering the Microsoft Speech SDK is out and has been for a while, the method used in this CodeProject article is a great starting point. We are using it to allow people to update a speech recognition grammar using a website. It is actually pretty easy to do this and makes a lot of sense compared to file based grammars if you are just dealing with simple phrases.

The great thing about SAPI is that it’s so easy to implement in any .net project (when you know how of course). We are currently using a database to store our phrases and then updating the grammar on any change to the database using regular polling. This isn’t ideal but is pretty quick & we need to regularly touch the database for other reasons.

Creating Integer Hashes

Today we needed to hash one unsigned integer into another to allow anonymising of user data. I found http://www.concentric.net/~Ttwang/tech/inthash.htm which has a list of hashing algorithms which can be easily used in c#. It’s important to test these hashing algorithms to see if duplicate hashes are created, as they were by the first 2 algorithms tried. The algorithm used in the end was tested for an input integer of 1 to 1,000,000 without any duplicates.


        /// <summary>
        /// Hashes every integer from 1 to 1,000,000 with hash32shiftmult and
        /// reports any two inputs that produce the same hash value.
        /// </summary>
        /// <remarks>
        /// Positions printed are indexes into the results list (input - 1).
        /// Each duplicate is reported once, against the first position that
        /// produced the same hash.
        /// </remarks>
        static void Main(string[] args)
        {
            const int maxInput = 1000000;
            List<Int32> results = new List<int>(maxInput);

            // Inclusive upper bound so the value 1,000,000 itself is tested.
            for (int c = 1; c <= maxInput; c++)
            {
                results.Add(hash32shiftmult(c));
            }

            Console.WriteLine("Added hashes to results table, looking for matches.");

            // hash value -> index where it was first seen. One pass with a
            // dictionary replaces comparing every pair of results.
            Dictionary<int, int> firstSeenAt = new Dictionary<int, int>(results.Count);

            for (int i = 0; i < results.Count; i++)
            {
                if (i % 10000 == 0)
                {
                    Console.Write(i + " ");
                }

                int earlier;
                if (firstSeenAt.TryGetValue(results[i], out earlier))
                {
                    Console.WriteLine("\r\nfound at " + earlier + " (" + results[earlier] + ") and " + i + " (" + results[i] + ")");
                }
                else
                {
                    firstSeenAt.Add(results[i], i);
                }
            }

            // Keep the console window open until Enter is pressed.
            Console.ReadLine();
        }

        /// <summary>
        /// Mixes three 32-bit values through nine subtract/xor/shift rounds
        /// and returns the final value of the third one.
        /// </summary>
        /// <remarks>
        /// The arithmetic is done on unsigned values so that every right
        /// shift fills with zeros. On <c>int</c>, <c>&gt;&gt;</c> copies the
        /// sign bit into the vacated positions, which gives different results
        /// as soon as an intermediate value is negative.
        /// NOTE(review): this looks like the 96-bit mix function from the
        /// integer-hash page cited in the text, which uses logical shifts -
        /// confirm against that listing.
        /// </remarks>
        /// <param name="a">First value to mix.</param>
        /// <param name="b">Second value to mix.</param>
        /// <param name="c">Third value to mix; its mixed form is returned.</param>
        /// <returns>The mixed value of <paramref name="c"/>.</returns>
        static int mix(int a, int b, int c)
        {
          unchecked // wrap-around is intended, even in a checked build
          {
            uint x = (uint)a, y = (uint)b, z = (uint)c;
            x -= y;  x -= z;  x ^= z >> 13;
            y -= z;  y -= x;  y ^= x << 8;
            z -= x;  z -= y;  z ^= y >> 13;
            x -= y;  x -= z;  x ^= z >> 12;
            y -= z;  y -= x;  y ^= x << 16;
            z -= x;  z -= y;  z ^= y >> 5;
            x -= y;  x -= z;  x ^= z >> 3;
            y -= z;  y -= x;  y ^= x << 10;
            z -= x;  z -= y;  z ^= y >> 15;
            return (int)z;
          }
        }

        /// <summary>
        /// Hashes one 32-bit integer into another using six add/xor/shift rounds.
        /// </summary>
        /// <remarks>
        /// Works on the unsigned bit pattern so the right shifts fill with
        /// zeros; with those shifts every round is reversible, so distinct
        /// inputs give distinct outputs.
        /// NOTE(review): the previous listing used 0x7ed55d6, 0xc761c2c,
        /// 0xd3a264c, 0xfd70465 and 0xb55a4f9, each one hex digit shorter
        /// than the constants of the published 32-bit integer hash this
        /// appears to be ported from. The full constants are restored here -
        /// confirm against the reference, and note that results differ from
        /// the previous listing.
        /// </remarks>
        /// <param name="a">The value to hash.</param>
        /// <returns>The hashed value.</returns>
        static int hash(int a)
        {
           unchecked // wrap-around is intended, even in a checked build
           {
              uint x = (uint)a;
              x = (x + 0x7ed55d16u) + (x << 12);
              x = (x ^ 0xc761c23cu) ^ (x >> 19);
              x = (x + 0x165667b1u) + (x << 5);
              x = (x + 0xd3a2646cu) ^ (x << 9);
              x = (x + 0xfd7046c5u) + (x << 3);
              x = (x ^ 0xb55a4f09u) ^ (x >> 16);
              return (int)x;
           }
        }

        /// <summary>
        /// Hashes one 32-bit integer into another using shifts, xors and a
        /// single multiplication by an odd constant.
        /// </summary>
        /// <remarks>
        /// Works on the unsigned bit pattern so the right shifts fill with
        /// zeros. With signed <c>&gt;&gt;</c> the final xor-shift always
        /// cleared the top bit, so the result was never negative and two
        /// different keys could share a hash. Results are unchanged for keys
        /// whose intermediate values never had the top bit set, and differ
        /// otherwise.
        /// </remarks>
        /// <param name="key">The value to hash.</param>
        /// <returns>The hashed value.</returns>
        static int hash32shiftmult(int key)
        {
          unchecked // wrap-around is intended, even in a checked build
          {
            const uint c2 = 0x27d4eb2d; // a prime or an odd constant
            uint k = (uint)key;
            k = (k ^ 61) ^ (k >> 16);
            k += k << 3;
            k ^= k >> 4;
            k *= c2;
            k ^= k >> 15;
            return (int)k;
          }
        }