Skip to content

Instantly share code, notes, and snippets.

@theraot
Created September 24, 2016 09:58
Show Gist options
  • Save theraot/e743751f0b4c8817e4991d09fd8b794a to your computer and use it in GitHub Desktop.
Save theraot/e743751f0b4c8817e4991d09fd8b794a to your computer and use it in GitHub Desktop.
using System;
using System.IO;
using System.Net;
using Theraot.Core;
namespace AWDC
{
/// <summary>
/// Console tool that downloads one web page, follows every link on it whose
/// target starts with a given prefix, and extracts prefix/postfix-delimited
/// data fields from each linked page into a tab-separated text file.
/// </summary>
internal class Program
{
    /// <summary>
    /// Interactively asks for the crawl settings on the console and then runs the collector.
    /// </summary>
    private static void Main()
    {
        Console.WriteLine("Welcome to Automatic Web Data Collector by Theraot");
        string pageUri;
        string uriPrefix;
        string file;
        int fields;
        string[] fieldPrefixes;
        string[] fieldPostfixes;
        try
        {
            pageUri = PromptNonEmpty("Enter the URI of the web page to explore: ");
            uriPrefix = PromptNonEmpty("Enter URI prefix to explore: ");
            file = PromptNonEmpty("Enter file to write to: ");
            do
            {
                Console.WriteLine("How many data fields to read: ");
                // A negative count parses fine but would make the array allocations below throw,
                // so keep asking until the number is usable.
            } while (!int.TryParse(ReadLineOrThrow(), out fields) || fields < 0);
            fieldPrefixes = new string[fields];
            fieldPostfixes = new string[fields];
            for (var index = 0; index < fields; index++)
            {
                // An empty prefix is accepted on purpose; only the end of input is rejected.
                Console.WriteLine($"Enter data field prefix #{index + 1}: ");
                fieldPrefixes[index] = ReadLineOrThrow();
                Console.WriteLine($"Enter data field postfix #{index + 1}: ");
                // A blank (or missing) postfix falls back to a single space.
                fieldPostfixes[index] = Console.ReadLine();
                if (StringHelper.IsNullOrWhiteSpace(fieldPostfixes[index]))
                {
                    fieldPostfixes[index] = " ";
                }
            }
        }
        catch (EndOfStreamException exception)
        {
            // Standard input was closed (e.g. redirected from a short file): stop instead of
            // re-prompting forever.
            Console.Error.WriteLine(exception.Message);
            Environment.ExitCode = 1;
            return;
        }
        Work(pageUri, uriPrefix, fields, fieldPrefixes, fieldPostfixes, file);
    }

    /// <summary>
    /// Reads one line from the console, failing when the input stream has ended.
    /// </summary>
    /// <returns>The line that was read; never null.</returns>
    /// <exception cref="EndOfStreamException">No more lines are available on standard input.</exception>
    private static string ReadLineOrThrow()
    {
        var line = Console.ReadLine();
        if (line == null)
        {
            throw new EndOfStreamException("Input ended before all settings were provided.");
        }
        return line;
    }

    /// <summary>
    /// Prints <paramref name="prompt"/> and reads a line, repeating until the answer is not empty.
    /// </summary>
    /// <param name="prompt">Text written to the console before each read.</param>
    /// <returns>The first non-empty line entered.</returns>
    /// <exception cref="EndOfStreamException">Standard input ended before an answer was given.</exception>
    private static string PromptNonEmpty(string prompt)
    {
        string answer;
        do
        {
            Console.WriteLine(prompt);
            answer = ReadLineOrThrow();
        } while (answer.Length == 0);
        return answer;
    }

    /// <summary>
    /// Downloads <paramref name="pageUri"/>, scans it for <c>href</c> attributes and, for every
    /// link starting with <paramref name="uriPrefix"/>, downloads the linked page and appends
    /// one tab-separated row of extracted fields to <paramref name="file"/>.
    /// </summary>
    /// <param name="pageUri">Address of the page whose links are explored.</param>
    /// <param name="uriPrefix">Only links whose target starts with this text are followed.</param>
    /// <param name="fields">Number of data fields to extract from each linked page.</param>
    /// <param name="fieldPrefixes">Text that precedes each field, in reading order.</param>
    /// <param name="fieldPostfixes">Text that terminates each field, in reading order.</param>
    /// <param name="file">Path of the output file; it is created or overwritten.</param>
    private static void Work(string pageUri, string uriPrefix, int fields, string[] fieldPrefixes, string[] fieldPostfixes, string file)
    {
        using (var client = new WebClient())
        using (var fileWriter = new StreamWriter(file))
        {
            var pageString = client.DownloadString(pageUri);
            var mainProcessor = new StringProcessor(pageString);
            // A null result from ReadUntilAfter is treated as "no further href in the page".
            while (mainProcessor.ReadUntilAfter("href") != null)
            {
                mainProcessor.ReadWhile(char.IsWhiteSpace);
                if (!mainProcessor.Read("="))
                {
                    // The text "href" was not used as an attribute here; keep scanning.
                    continue;
                }
                mainProcessor.ReadWhile(char.IsWhiteSpace);
                var href = ReadAttributeValue(mainProcessor);
                mainProcessor.ReadWhile(char.IsWhiteSpace);
                // URIs are identifiers, not linguistic text, so compare them ordinally.
                if (href == null || !href.StartsWith(uriPrefix, StringComparison.Ordinal))
                {
                    continue;
                }
                string dataPage;
                try
                {
                    dataPage = client.DownloadString(href);
                }
                catch (WebException exception)
                {
                    // One unreachable page should not discard the rest of the crawl.
                    Console.Error.WriteLine($"Skipping {href}: {exception.Message}");
                    continue;
                }
                ProcessPage(fields, dataPage, fieldPrefixes, fieldPostfixes, data => WriteRow(fileWriter, data));
            }
        }
    }

    /// <summary>
    /// Reads an attribute value at the current position: double-quoted, single-quoted, or
    /// unquoted (ending at whitespace or at the closing <c>&gt;</c> of the tag).
    /// </summary>
    /// <param name="processor">Reader positioned just after the <c>=</c> and any whitespace.</param>
    /// <returns>The attribute value without its surrounding quotes.</returns>
    private static string ReadAttributeValue(StringProcessor processor)
    {
        string value;
        if (processor.Read('"'))
        {
            value = processor.ReadWhile(c => c != '"');
            processor.Read('"');
        }
        else if (processor.Read('\''))
        {
            value = processor.ReadWhile(c => c != '\'');
            processor.Read('\'');
        }
        else
        {
            value = processor.ReadWhile(c => !char.IsWhiteSpace(c) && c != '>');
        }
        return value;
    }

    /// <summary>
    /// Writes one row to the output: each value followed by a tab, then a line break.
    /// The writer is flushed after every row and a progress dot is printed on the console.
    /// </summary>
    /// <param name="writer">Destination of the row.</param>
    /// <param name="data">Field values of the row, in output order.</param>
    private static void WriteRow(TextWriter writer, string[] data)
    {
        foreach (var value in data)
        {
            writer.Write(value);
            writer.Write('\t');
        }
        writer.WriteLine();
        writer.Flush();
        Console.Write(".");
    }

    /// <summary>
    /// Extracts <paramref name="fields"/> values from <paramref name="dataPage"/> and hands
    /// them to <paramref name="add"/> as a single array.
    /// </summary>
    /// <param name="fields">Number of values to extract.</param>
    /// <param name="dataPage">Text of the page to read from.</param>
    /// <param name="fieldPrefixes">Text searched for before each value.</param>
    /// <param name="fieldPostfixes">Text that ends each value.</param>
    /// <param name="add">Callback invoked once with the extracted values.</param>
    private static void ProcessPage(int fields, string dataPage, string[] fieldPrefixes, string[] fieldPostfixes, Action<string[]> add)
    {
        var data = new string[fields];
        // A single reader is shared by all fields, so each field is looked up after the
        // position where the previous one ended.
        var dataProcessor = new StringProcessor(dataPage);
        for (var index = 0; index < fields; index++)
        {
            // NOTE(review): the result is ignored, so a prefix that is missing from the page is
            // not detected here; what ends up in the field then depends on StringProcessor — confirm.
            dataProcessor.ReadUntilAfter(fieldPrefixes[index]);
            SkipTags(dataProcessor);
            data[index] = dataProcessor.ReadUntil(fieldPostfixes[index]);
        }
        add(data);
    }

    /// <summary>
    /// Skips consecutive markup tags (<c>&lt;...&gt;</c>) and the whitespace after each of them,
    /// starting at the reader's current position.
    /// </summary>
    /// <param name="dataProcessor">Reader to advance.</param>
    private static void SkipTags(StringProcessor dataProcessor)
    {
        while (dataProcessor.Read("<"))
        {
            dataProcessor.ReadUntilAfter(">");
            dataProcessor.ReadWhile(char.IsWhiteSpace);
        }
    }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment