Extracting all links from a HTML page
July 30, 2010
The code below is a small class that extracts all links from a HTML page using a regular expression. The method returns a list of URLs, which can include formats such as “#” and “javascript:;”.
If you get proxy authentication problems behind a corporate firewall or proxy, add an app.config with the following lines:
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
  <system.net>
    <!-- Use the default (logged-on user) credentials when talking to the proxy. -->
    <defaultProxy useDefaultCredentials="true" enabled="true">
      <!-- Let the framework detect the proxy settings automatically. -->
      <proxy autoDetect="True"/>
    </defaultProxy>
  </system.net>
</configuration>
This will take the proxy details from IE. I still get issues even with the above configuration, so it is fairly hit and miss depending on the hardware or software your proxy is using.
/// <summary>
/// Example console program: downloads a page, extracts its links with
/// <c>LinkExtractor.Extract</c> and prints one link per line.
/// </summary>
class Program
{
    /// <summary>
    /// Downloads the example page, prints every extracted link and then waits
    /// for a line of console input before exiting.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        // Example usage:
        string html;

        // WebClient is IDisposable, so release it as soon as the download is done.
        using (WebClient client = new WebClient())
        {
            byte[] buffer = client.DownloadData("http://www.yahoo.jp");

            // GetString() extension method is from:
            // http://www.shrinkrays.net/code-snippets/csharp/an-extension-method-for-converting-a-byte-array-to-a-string.aspx
            html = buffer.GetString();
        }

        List<string> list = LinkExtractor.Extract(html);
        foreach (var link in list)
        {
            Console.WriteLine(link);
        }

        // Keep the console window open until the user presses Enter.
        Console.ReadLine();
    }
}
/// <summary>
/// Extracts the values of <c>href</c> and <c>src</c> attributes from HTML text
/// using a regular expression.
/// </summary>
public class LinkExtractor
{
    // Built once and reused by every call instead of being re-parsed each time.
    //
    //   (?:href|src)=   attribute name followed by '='
    //   ["']?           optional opening quote
    //   (.*?)           the link itself (group 1), as short as possible
    //   ["'>]+          closing quote and/or the end of the tag
    //
    // Inside a character class '|' is a literal pipe, not an alternation, so it
    // must not appear there: otherwise URLs containing '|' are cut short.
    // IgnoreCase makes upper-case attribute names (HREF=, SRC=) match as well;
    // CultureInvariant keeps that case folding culture-independent.
    private static readonly Regex LinkRegex = new Regex(
        "(?:href|src)=[\"']?(.*?)[\"'>]+",
        RegexOptions.Singleline | RegexOptions.CultureInvariant | RegexOptions.IgnoreCase);

    /// <summary>
    /// Extracts all src and href links from a HTML string.
    /// </summary>
    /// <param name="html">The html source</param>
    /// <returns>
    /// A list of links in document order - these will be all links including
    /// javascript ones. The list is empty when nothing matches.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="html"/> is null.</exception>
    public static List<string> Extract(string html)
    {
        if (html == null)
        {
            throw new ArgumentNullException("html");
        }

        List<string> list = new List<string>();

        // Matches() returns an empty collection when there is nothing to find,
        // so a separate IsMatch() pre-check would only scan the input twice.
        foreach (Match match in LinkRegex.Matches(html))
        {
            list.Add(match.Groups[1].Value);
        }

        return list;
    }
}
I'm Chris Small, a software engineer working in London. This is my tech blog. Find out more about me via Github, Stackoverflow, Resume