HTML File Agility Pack Table Data


This entry is part 3 of 5 in the series HTML Agility Pack

This post follows from our previous post called HTML Agility Pack Website Tables. Here we are extracting data in a table that’s in an HTML file. We are exporting that data to a CSV file.

Here, our source data is a file, not a website URL as it was in the previous post. You can get the HTML file from your browser by viewing the source, and saving that into a file on your computer.

Compared with the code in the previous post, we have made two changes.

  • The data source is an HTML file
  • We now ask the user to input the source file, the destination file and the delimiting character
using System;
using System.IO;
using HtmlAgilityPack;

namespace AgilityScrapeTableMikeWeb
{
    /// <summary>
    /// Console program that loads an HTML file, finds every table in it and
    /// appends the text of each row's cells to a delimiter-separated (CSV) file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Prompts for the separator character, the HTML source file and the CSV
        /// output file, then writes one output line per table row that has at
        /// least one <c>th</c> or <c>td</c> cell. Each line is also echoed to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Scrape an HTML file for all of its tables and output the table");
            Console.WriteLine(@"data to a CSV file. Ctrl+C terminates the program.");
            Console.WriteLine("\nThis C# console program uses the HTML Agility Pack.");

            Console.WriteLine("\nEnter the CSV separator character and press Enter. (Normally its a comma).");
            Console.WriteLine("If the table data contains commas, try using the pipe (|) character.");
            string delimiter = Console.ReadLine();
            // ReadLine returns null at end of input (for example redirected stdin).
            if (delimiter == null || delimiter.Length != 1)
            {
                ReportAndPause("Separator must be exactly one character.");
                return;
            }

            Console.WriteLine("\nEnter an HTML source path and file name:");
            Console.WriteLine(@"For example enter c:\temp\a.html");
            string strSourcePath = Console.ReadLine();
            if (string.IsNullOrEmpty(strSourcePath))
            {
                ReportAndPause("No file entered.");
                return;
            }
            // Fail early with a readable message instead of an unhandled
            // exception from HtmlDocument.Load further down.
            if (!File.Exists(strSourcePath))
            {
                ReportAndPause("Source file not found: " + strSourcePath);
                return;
            }

            Console.WriteLine("\nEnter an output CSV fileame and path.");
            Console.WriteLine(@"If the file already exists the data will be appended at the bottom of the file.");
            Console.WriteLine("If the file does not exits, it will be created.");
            Console.WriteLine(@"For example enter c:\temp\tbl.csv");
            string pathFileOut = Console.ReadLine();
            if (string.IsNullOrEmpty(pathFileOut))
            {
                ReportAndPause("No file entered.");
                return;
            }

            // =======================================================================================
            Console.WriteLine("Scraping tables from: " + strSourcePath);
            var doc = new HtmlDocument();
            doc.Load(strSourcePath);

            // SelectNodes returns null (not an empty collection) when nothing matches,
            // so check before opening (and possibly creating) the output file.
            HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
            if (tables == null)
            {
                ReportAndPause("No tables found in: " + strSourcePath);
                return;
            }

            // Now it is time to output our data with StreamWriter and the Console.
            // The second argument (true) opens the file in append mode.
            using (var sw = new StreamWriter(pathFileOut, true))
            {
                foreach (HtmlNode table in tables)
                {
                    Console.WriteLine("\nFound: " + table.Name);

                    // The leading "." makes the XPath relative to this table. A bare
                    // "//tr" would select every row in the whole document for each table.
                    // NOTE: rows of a nested table are matched here and again when the
                    // nested table itself is visited by the outer loop.
                    HtmlNodeCollection rows = table.SelectNodes(".//tr");
                    if (rows == null)
                    {
                        continue;
                    }

                    foreach (HtmlNode row in rows)
                    {
                        HtmlNodeCollection cells = row.SelectNodes("th|td");
                        if (cells == null)
                        {
                            // A row without header or data cells has nothing to export.
                            continue;
                        }

                        Console.WriteLine("");
                        var cellTexts = new string[cells.Count];
                        for (int i = 0; i < cells.Count; i++)
                        {
                            // TODO: call a function that parses and cleans any tags
                            // TODO: call a function that removes '
                            cellTexts[i] = cells[i].InnerText;
                            Console.Write(cellTexts[i] + delimiter);
                        }

                        // Join puts the separator only between cells, so no trailing
                        // separator (and therefore no empty last column) is written.
                        sw.WriteLine(string.Join(delimiter, cellTexts));
                    }
                }
            }

            Console.WriteLine("\n\nDone!  " + pathFileOut);
            Console.WriteLine("Press any key to end this program.");
            WaitForKey();
        }

        /// <summary>
        /// Prints <paramref name="message"/> followed by the exit prompt and waits
        /// for a key press so the user can read it before the window closes.
        /// </summary>
        /// <param name="message">Text explaining why the program is stopping.</param>
        private static void ReportAndPause(string message)
        {
            Console.WriteLine(message);
            Console.WriteLine("Press any key to exit the program.");
            WaitForKey();
        }

        /// <summary>
        /// Waits for a key press, unless standard input is redirected, in which
        /// case Console.ReadKey would throw InvalidOperationException.
        /// </summary>
        private static void WaitForKey()
        {
            if (!Console.IsInputRedirected)
            {
                Console.ReadKey();
            }
        }
    }
}
Series Navigation: << HTML Agility Pack Website Tables | HTML Agility Pack Selectors >>