- HTML Agility Pack
- HTML Agility Pack Website Tables
- HTML File Agility Pack Table Data
- HTML Agility Pack Selectors
- HTML File Table Extractor
This post follows on from our previous post, HTML Agility Pack Website Tables. Here we extract the data from the tables in an HTML file and export that data to a CSV file.
Here, our source data is a file, not a website URL as it was in the previous post. You can get the HTML file from your browser by viewing the source, and saving that into a file on your computer.
Compared with the previous post, we have made two changes.
- The data source is an HTML file
- We now ask the user to input the source file, the destination file and the delimiting character
using System;
using System.IO;
using HtmlAgilityPack;
namespace AgilityScrapeTableMikeWeb
{
    /// <summary>
    /// Console program that reads an HTML file with the HTML Agility Pack and
    /// appends the cell text of every table row it finds to a delimited text file.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Prompts for a one-character separator, a source HTML file and an output
        /// file, then writes one delimited line per table row to the output file
        /// (appending if it already exists) and echoes the data to the console.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            Console.WriteLine("Scrape an HTML file for all of its tables and output the table");
            Console.WriteLine(@"data to a CSV file. Ctrl+C terminates the program.");
            Console.WriteLine("\nThis C# console program uses the HTML Agility Pack.");

            Console.WriteLine("\nEnter the CSV separator character and press Enter. (Normally its a comma).");
            Console.WriteLine("If the table data contains commas, try using the pipe (|) character.");
            // Console.ReadLine returns null at end of input, so test for null before Length.
            string delimiter = Console.ReadLine();
            if (delimiter == null || delimiter.Length != 1)
            {
                WaitThenExit("Separator must be exactly one character.");
                return;
            }

            Console.WriteLine("\nEnter an HTML source path and file name:");
            Console.WriteLine(@"For example enter c:\temp\a.html");
            string strSourcePath = Console.ReadLine();
            if (string.IsNullOrEmpty(strSourcePath))
            {
                WaitThenExit("No file entered.");
                return;
            }
            if (!File.Exists(strSourcePath))
            {
                WaitThenExit("Source file not found: " + strSourcePath);
                return;
            }

            Console.WriteLine("\nEnter an output CSV fileame and path.");
            Console.WriteLine(@"If the file already exists the data will be appended at the bottom of the file.");
            Console.WriteLine("If the file does not exits, it will be created.");
            Console.WriteLine(@"For example enter c:\temp\tbl.csv");
            string pathFileOut = Console.ReadLine();
            if (string.IsNullOrEmpty(pathFileOut))
            {
                WaitThenExit("No file entered.");
                return;
            }

            // =======================================================================================
            // Report the file being read (the source), not the file being written.
            Console.WriteLine("Scraping tables from: " + strSourcePath);
            var doc = new HtmlDocument();
            doc.Load(strSourcePath);

            // SelectNodes returns null, not an empty collection, when nothing matches.
            HtmlNodeCollection tables = doc.DocumentNode.SelectNodes("//table");
            if (tables == null)
            {
                WaitThenExit("No tables found in: " + strSourcePath);
                return;
            }

            // Second argument 'true' opens the output file in append mode.
            using (var sw = new StreamWriter(pathFileOut, true))
            {
                WriteTables(tables, sw, delimiter);
            }

            Console.WriteLine("\n\nDone! " + pathFileOut);
            Console.WriteLine("Press any key to end this program.");
            Console.ReadKey();
        }

        /// <summary>
        /// Writes one delimited line per row of each table to <paramref name="sw"/>
        /// and echoes the cell text to the console.
        /// </summary>
        /// <param name="tables">The table nodes to export.</param>
        /// <param name="sw">Destination writer for the delimited lines.</param>
        /// <param name="delimiter">Separator placed between cell values.</param>
        private static void WriteTables(HtmlNodeCollection tables, TextWriter sw, string delimiter)
        {
            foreach (HtmlNode table in tables)
            {
                Console.WriteLine("\nFound: " + table.Name);

                // ".//tr" is relative to this table; a bare "//tr" would select every
                // row in the whole document again for each table.
                // NOTE(review): rows of nested tables are descendants too, so they are
                // matched here and again when the nested table itself is visited.
                HtmlNodeCollection rows = table.SelectNodes(".//tr");
                if (rows == null)
                {
                    continue;
                }

                foreach (HtmlNode row in rows)
                {
                    HtmlNodeCollection cells = row.SelectNodes("th|td");
                    if (cells == null)
                    {
                        // Row without header or data cells: nothing to export.
                        continue;
                    }

                    Console.WriteLine("");
                    var cellTexts = new string[cells.Count];
                    for (int i = 0; i < cells.Count; i++)
                    {
                        cellTexts[i] = cells[i].InnerText;
                        Console.Write(cellTexts[i] + delimiter);
                    }

                    // Join puts the separator only between values, so no trailing
                    // separator (and no extra empty column) is produced.
                    sw.WriteLine(string.Join(delimiter, cellTexts));
                }
            }
        }

        /// <summary>
        /// Prints <paramref name="message"/>, then waits for a key press so the
        /// user can read it before the program ends.
        /// </summary>
        /// <param name="message">Explanation shown to the user.</param>
        private static void WaitThenExit(string message)
        {
            Console.WriteLine(message);
            Console.WriteLine("Press any key to exit the program.");
            Console.ReadKey();
        }
    }
}