- HTML Agility Pack
- HTML Agility Pack Website Tables
- HTML File Agility Pack Table Data
- HTML Agility Pack Selectors
- HTML File Table Extractor
This post follows on from our previous post, HTML Agility Pack Website Tables. Here we extract the data from the tables in an HTML file and export that data to a CSV file.
Here, our source data is a file, not a website URL as it was in the previous post. You can get the HTML file from your browser by viewing the source, and saving that into a file on your computer.
From the previous post, we have made two changes.
- The data source is an HTML file
- We now ask the user to input the source file, the destination file and the delimiting character
using System;
using System.Collections.Generic;
using System.IO;
using HtmlAgilityPack;

namespace AgilityScrapeTableMikeWeb
{
    class Program
    {
        /// <summary>
        /// Reads an HTML file, finds every table in it and appends the cell text
        /// of each table row to a CSV file, one line per row.
        /// The separator character, the source HTML path and the output CSV path
        /// are all read interactively from the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Scrape an HTML file for all of its tables and output the table");
            Console.WriteLine(@"data to a CSV file. Ctrl+C terminates the program.");
            Console.WriteLine("\nThis C# console program uses the HTML Agility Pack.");

            Console.WriteLine("\nEnter the CSV separator character and press Enter. (Normally its a comma).");
            Console.WriteLine("If the table data contains commas, try using the pipe (|) character.");
            // ReadLine returns null at end of input; treat that as an empty answer.
            string delimiter = Console.ReadLine() ?? "";
            if (delimiter.Length != 1)
            {
                PauseBeforeExit("Separator must be exactly one character.");
                return;
            }

            Console.WriteLine("\nEnter an HTML source path and file name:");
            Console.WriteLine(@"For example enter c:\temp\a.html");
            string strSourcePath = Console.ReadLine() ?? "";
            if (strSourcePath.Length < 1)
            {
                PauseBeforeExit("No file entered.");
                return;
            }
            if (!File.Exists(strSourcePath))
            {
                // Fail with a readable message instead of an unhandled exception from Load.
                PauseBeforeExit("Source file not found: " + strSourcePath);
                return;
            }

            Console.WriteLine("\nEnter an output CSV fileame and path.");
            Console.WriteLine(@"If the file already exists the data will be appended at the bottom of the file.");
            Console.WriteLine("If the file does not exits, it will be created.");
            Console.WriteLine(@"For example enter c:\temp\tbl.csv");
            string pathFileOut = Console.ReadLine() ?? "";
            if (pathFileOut.Length < 1)
            {
                PauseBeforeExit("No file entered.");
                return;
            }

            // =======================================================================================
            Console.WriteLine("Scraping tables from: " + strSourcePath);

            var doc = new HtmlDocument();
            doc.Load(strSourcePath);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
            if (tables == null)
            {
                Console.WriteLine("\nNo tables found in: " + strSourcePath);
            }
            else
            {
                // Second argument 'true' opens the file in append mode.
                using (var sw = new StreamWriter(pathFileOut, true))
                {
                    foreach (HtmlNode table in tables)
                    {
                        Console.WriteLine("\nFound: " + table.Name);
                        WriteTableRows(table, sw, delimiter);
                    }
                }
            }

            Console.WriteLine("\n\nDone! " + pathFileOut);
            Console.WriteLine("Press any key to end this program.");
            Console.ReadKey();
        }

        /// <summary>
        /// Writes every row of one table to the writer as a delimited line and
        /// echoes the cells to the console.
        /// </summary>
        /// <param name="table">The table node whose rows are exported.</param>
        /// <param name="sw">Destination writer for the CSV lines.</param>
        /// <param name="delimiter">Separator placed between cell values.</param>
        private static void WriteTableRows(HtmlNode table, TextWriter sw, string delimiter)
        {
            // ".//tr" is relative to this table; a bare "//tr" would search the whole
            // document and repeat every row once per table.
            HtmlNodeCollection rows = table.SelectNodes(".//tr");
            if (rows == null)
            {
                return;
            }

            foreach (HtmlNode row in rows)
            {
                HtmlNodeCollection cells = row.SelectNodes("th|td");
                if (cells == null)
                {
                    // Row without header or data cells: nothing to export.
                    continue;
                }

                Console.WriteLine("");
                var values = new List<string>(cells.Count);
                foreach (HtmlNode cell in cells)
                {
                    Console.Write(cell.InnerText + delimiter);
                    values.Add(cell.InnerText);
                }

                // Join puts the separator only between values, so no trailing
                // separator (and no empty extra column) is produced.
                sw.WriteLine(string.Join(delimiter, values));
            }
        }

        /// <summary>
        /// Prints a message, then waits for a key press so the user can read it
        /// before the console window closes.
        /// </summary>
        /// <param name="message">Text explaining why the program is stopping.</param>
        private static void PauseBeforeExit(string message)
        {
            Console.WriteLine(message);
            Console.WriteLine("Press any key to exit the program.");
            Console.ReadKey();
        }
    }
}