HTML File Table Extractor


This entry is part 5 of 5 in the series HTML Agility Pack

This is a WPF sample program that scrapes an HTML file for the data it contains in all of (if any) its HTML tables. It uses the HTMLAgilityPack. You need to add a reference to the HTMLAgilityPack in your Visual Studio project.

Another way to do this is to use Microsoft Excel. We have a post called Get Data From Web Into Excel.

Another way to do this is to use Python. I don’t yet have an article on this, however, you might check out an article called Reading HTML tables with Pandas at Practical Business Python.

Here is the screenshot of the final result of or program, at run time. Click to enlarge it.

<Window x:Class="HFTE.MainWindow"
        xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
        xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
        xmlns:d="http://schemas.microsoft.com/expression/blend/2008"
        xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006"
        xmlns:local="clr-namespace:HFTE"
        mc:Ignorable="d"
        Background="Ivory"
        WindowStartupLocation="CenterScreen"
        Title="HFTE2" Height="470" Width="710">
<!-- The real work is done in the FileProcessor.cs class -->  
    <StackPanel>
        <Label Margin="14,8,14,2" Content="HTML File Table Extractor (HFTE- pronounced: hefty)" HorizontalContentAlignment="Center" FontWeight="Bold" FontSize="16" Background="Ivory" Foreground="#FF3727D0"/>
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" Text="HFTE scrapes HTML table data from inside an HTML file and exports only the data to a pipe-delimited CSV file."/>
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" Text="HFTE extracts all of the data inside all of the tables in the file. No other data is extracted."/>
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" Text="Browse out to an HTML-formatted text file (.txt, .html) and click OK."/>
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" Text="A new file with the same name (but with a csv extension) will be created in the same directory as the source file."/>
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" Text="HFTE delimits the data in the file with the pipe ( | ) character. "/>
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" FontWeight="Bold" Text="Warning: If the CSV file already exists it will first be deleted!"/>
        <local:XamlFileBrowser Loaded="XamlFileBrowser_Loaded" Margin="6,2,2,2"  />
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" Text="HFTE accepts HTML files that have multiple tables, even if those tables have different numbers of rows and columns."/>
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" Text="In the file output, HFTE includes the header rows and the data elements."/>
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" Text="HFTE only outputs the text between those tags and not the tags themselves. "/>
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" Text="If a data element or header has no data, HFTE will still delimit it with the pipe character. "/>
        <TextBlock Margin="14,0,0,0" TextWrapping="Wrap" Text="HFTE will work whether or not the HTML tables include the thead, tbody or tfoot tags."/>

        <TextBlock x:FieldModifier="public" x:Name="txtBlockResult" Margin="14,14,14,0" 
                   Background="Black" Foreground="White" TextWrapping="Wrap" Padding="10,2,2,2">
            <Bold>Results:</Bold><LineBreak/>
            <!-- the FileProcessor.cs will add to this with the results... -->
        </TextBlock>
        <Label Margin="10,6,0,0">
            <Hyperlink NavigateUri="https://begincodingnow.com" RequestNavigate="Hyperlink_RequestNavigate">
                
Home
</Hyperlink> </Label> </StackPanel> </Window>

Here is the C# procedural code in the MainWindow.xaml.cs file.

using System.Windows;
using System.Diagnostics;
using System.Windows.Navigation;
namespace HFTE
{
    public partial class MainWindow : Window
    {
        public MainWindow()
        {
            InitializeComponent();

            string strAppName = System.Reflection.Assembly.GetExecutingAssembly().GetName().Name;
            string strAppVersion = System.Reflection.Assembly.GetExecutingAssembly().GetName().Version.ToString();
            this.Title = strAppName + "  v." + strAppVersion + " - begincodingnow.com";
        }
        public void Hyperlink_RequestNavigate(object sender, RequestNavigateEventArgs e)
        {
            Process.Start(new ProcessStartInfo(e.Uri.AbsoluteUri));
            e.Handled = true;
        }
    }
}

The main work of this program is done in a separate class file called FileProcessor.cs.

using System;
using System.Windows;
using System.IO;
using HtmlAgilityPack;
using System.Linq;
using System.Windows.Documents;
// HFTE2
namespace HFTE
{
    class FileProcessor
    {
        public static string strAppName { get; set; }
        public static string strAppVersion { get; set; }
        public static string strLineOut { get; set; }
        public static string strOuterText { get; set; }
        public static string pathFileOut { get; set; }
        public static string strSourceExtension { get; set; }
        public static string strCSVFileName { get; set; }
        public static string strInnerText { get; set; }
        public static int tcount { get; set; }
        public static int thtdcount { get; set; }
        public static int trcount { get; set; }
        public static int linesWrittenOut { get; set; }
        public static string delimiter { get; set; }
        public static bool csvFileWasCreated { get; set; }
        public static bool outputFileAlreadyExists { get; set; }
        public static MainWindow win { get; set; }
        public static string strFileDeleted { get; set; }

        public static void ProcessFile(string strSourcePath)
        {
            //
            strAppName = System.Reflection.Assembly.GetExecutingAssembly().GetName().Name;
            strAppVersion = System.Reflection.Assembly.GetExecutingAssembly().GetName().Version.ToString();
            pathFileOut = "";
            strSourceExtension = "";
            linesWrittenOut = 0;
            tcount = 0;   // tables
            trcount = 0;  // rows
            thtdcount = 0;  // th or td (th plus td)
            csvFileWasCreated = false;

            if (strSourcePath != "")
            {
                strSourceExtension = Path.GetExtension(strSourcePath);
                strCSVFileName = Path.GetFileNameWithoutExtension(strSourcePath) + ".csv";

                MessageBoxResult result = MessageBox.Show("Write to CSV file\n" + strCSVFileName + "\nIn the same folder?" + 
                    "\n\nWarning: If the file exists it will be over-written.", strAppName + "  " + 
                    strAppVersion, MessageBoxButton.OKCancel, MessageBoxImage.Warning);
                if (result == MessageBoxResult.OK)
                {
                    strLineOut = "";
                    strInnerText = "";
                    strOuterText = "";
                    delimiter = "|";

                    // we are going to output a file with a similar name as the input file
                    pathFileOut = strSourcePath.Substring(0, (strSourcePath.Length - (Path.GetExtension(strSourcePath).Length) + 1)) + "csv";

                    var doc = new HtmlDocument();
                    doc.Load(strSourcePath);

                    win = (MainWindow)Application.Current.MainWindow;
                    //win.lblCompletedFileName.Content = "";  // looks better from users point of view.

                    outputFileAlreadyExists = (File.Exists(pathFileOut)) ? true : false;

                    // if we wrote a file and one already existed then we must have deleted it first
                    // because that's what StreamWriter does in WriteCSVFile()
                    if (WriteCSVFile(strSourcePath))
                    {
                        strFileDeleted = outputFileAlreadyExists ? "File Deleted.  " : strFileDeleted = "";

                        // Output the results messages to the main window.
                        //win.txtBlockResult.Inlines.Add(new Run("Results:\n") { FontWeight = FontWeights.Bold });
                        string results = tcount.ToString() + " table(s) found.";
                        results += "\n" + thtdcount.ToString() + " data pieces found (including headers)";
                        results += outputFileAlreadyExists ? "\nFile Deleted:  " + pathFileOut :  "";
                        results += "\nFile Created: " + pathFileOut;
                        results += "\n" + linesWrittenOut.ToString() + " rows written to file.";
                        results += "\nDone.";
                        win.txtBlockResult.Text = "";
                        win.txtBlockResult.Inlines.Add(new Run("Results:\n") { FontWeight = FontWeights.Bold });
                        win.txtBlockResult.Inlines.Add(results);
                    }
                    else
                    {
                        // if there were no tables found in file the we know that we did not write a file.
                        win.txtBlockResult.Text = "";
                        win.txtBlockResult.Inlines.Add(new Run("Results:\n") { FontWeight = FontWeights.Bold });
                        win.txtBlockResult.Inlines.Add("No HTML tables found in file. \nCSV file not created.");
                    }
                }
            } // OK
            else
            {
                MessageBox.Show("No file selected.", strAppName);
            }
        
        } // ProcessFile()

        // =============================================================================================================
        private static bool WriteCSVFile(string strSourcePath)
        {
            try
            {
                HtmlDocument doc = new HtmlDocument();
                doc.Load(strSourcePath);

                var tbls = doc.DocumentNode.SelectNodes("//table");
                if (tbls != null)
                {
                    // if we found a table in the file then we will write a new file
                    // even if the tble is absolutely empty of data and has nothing but
                    // empty data elements, such as <th></th> <td></td> <td></td> and so on
                    pathFileOut = strSourcePath.Substring(0, (strSourcePath.Length - (Path.GetExtension(strSourcePath).Length) + 1)) + "csv";
                    using (var sw = new StreamWriter(pathFileOut, false))
                    {
                        foreach (var table in tbls)
                        {
                            tcount = tcount + 1;  // count the tables in the file.
                            // Stack Overflow: (linq code adopted from):
                            // stackoverflow.com/questions/12168300/parse-table-with-html-agility-pack
                            // So the tricky part is when some tables have the <tbody> tag and some do not.
                            // To overcome we use LINQ. We are taking <table> and simply looking for <tr>
                            // descendents, regardless of how far down we have to go, perhaps bypassing
                            // the <tbody> <thead> or <tfoot> tags if there are any.
                            // We don't care about <thead> <tbody> and <tfoot>.
                            // This way, the code here works whether or not we have these
                            // <tbody> <thead> or <tfoot> tags.
                            foreach (HtmlNode row in table.Descendants().Where(desc =>
                                      desc.Name.Equals("tr", StringComparison.OrdinalIgnoreCase)))
                            {
                                strLineOut = "";
                                strInnerText = "";
                                // Once we've selected our rows, we could see <th> or <td>
                                // We do want both of these, and we want the <td>s within them.
                                foreach (var cell in row.SelectNodes("th|td"))
                                {
                                    strOuterText = cell.OuterHtml;
                                    strInnerText = cell.InnerText;
                                    // ========================================================================================
                                    // There may be a lot of "junk" inside the <td> and <th>, or not.
                                    // We can remove that junk with the following OPTIONAL lines of code:
                                    strInnerText = StringFunctions.RemoveCRLFTab(strInnerText); // remove CR and LF and Tab. \r \n \t
                                    //strInnerText = StringFunctions.RemoveSpanTags(strOuterText);  // NOT WORKING
                                    strInnerText = StringFunctions.RemoveHTMLEntities(strInnerText); // HTML entities like &#160;
                                    //strInnerText = StringFunctions.RemoveEverythingPreceedingColonIncludingColon(strInnerText);
                                    //strInnerText = StringFunctions.ReplaceMultipleSpacesWithSingleSpace(strInnerText);
                                    // ==========================================================================================
                                    strInnerText = strInnerText.Trim();
                                    strLineOut = strLineOut + strInnerText + delimiter;
                                    thtdcount = thtdcount + 1;
                                } // foreach cell (meaning td or th)
                                trcount = trcount + 1;
                                // remove last separator from string as it creates column at end
                                if (strLineOut.Length > 3)
                                {
                                    strLineOut = strLineOut.Substring(0, strLineOut.Length - 1);
                                }

                                if (strLineOut.Length > 0)
                                {
                                    sw.WriteLine(strLineOut);
                                    linesWrittenOut = linesWrittenOut + 1;
                                }
                            } // foreach row in table
                        } // foreach table
                        
                    }  // StreamWriter

                    return true;

                } // has tables!
                else  // does not have any tables.
                {
                    linesWrittenOut = 0;
                    tcount = 0;
                    thtdcount = 0;
                    trcount = 0;
                    csvFileWasCreated = false;
                    return false;
                }
            }  // try
            catch (IOException e)
            {
                //
                MessageBox.Show(e.Source, strAppName + "  " + strAppVersion, MessageBoxButton.OK);
                return false;
            }
        }
    }
}

Series Navigation<< HTML Agility Pack Selectors