Every day I spend a little time (a.k.a. lunch) tinkering with side projects or questions that I've raised to myself while browsing the net, or thinking about ideas for the apps I would like to write, have already written, or wrote and sent to the graveyard. Lately I've been toying with HTML parsing and came upon something I haven't dealt with much in web development: CSS selectors. CSS selectors are a really neat method of selecting nodes/elements in an HTML document based on logical ordering and criteria.
An example of CSS selectors:
Code: css
- DIV P *[href] {}
- DIV OL > LI P {}
- DIV > P:first-child { text-indent: 0; }
- H1.opener + H2 { margin-top: -5mm; }
- SPAN[hello="Cleveland"][goodbye="Columbus"] { color: blue; }
- .body .forum { width: 800px; }
Now, there's nothing native (outside of a web browser) for .NET that will parse or handle these selectors. That makes sense because there isn't a native HTML DOM parser either. However, XML documents have great representation in the CLR. After all, most of the websites that web developers build now are XHTML anyhow, which is just strict HTML based on XML (>> loosely translated <<). XPath is a means of selecting nodes in an XML document in much the same way that CSS selectors work. So I went on a mission to find a method of translating, or converting, CSS selectors to an XPath statement. Lo and behold, Joe Hewitt came up with some JavaScript a few years ago, and it has propagated across the web thoroughly like any good little script.
So today I translated it into something usable for .NET. The conversion is to C#. If you're a VB.NET developer and you can't read this, and/or can't translate between C# and VB.NET with ease then shame on you. Go make yourself SMRT and learn the C# (read: C) syntax already.
Code: c#
//Rules verified from http://plasmasturm.org/log/444/
//Converted from http://www.joehewitt.com/blog/files/getElementsBySelector.js

// The selector grammar is fixed, so the patterns are built once rather than on every call.
// Every pattern is anchored with ^ because the translator always consumes from the front of the rule.
private static readonly Regex CssElementPattern = new Regex(@"^([#.]?)([a-z0-9\\*_-]*)((\|)([a-z0-9\\*_-]*))?", RegexOptions.IgnoreCase | RegexOptions.ECMAScript);
private static readonly Regex CssAttributePresentPattern = new Regex(@"^\[([^\]]*)\]", RegexOptions.IgnoreCase | RegexOptions.ECMAScript);
private static readonly Regex CssAttributeValuePattern = new Regex(@"^\[\s*([^~=\s]+)\s*(~?=)\s*""([^""]+)""\s*\]", RegexOptions.IgnoreCase | RegexOptions.ECMAScript);
private static readonly Regex CssPseudoPattern = new Regex(@"^:([a-z_-])+", RegexOptions.IgnoreCase | RegexOptions.ECMAScript);
private static readonly Regex CssCombinatorPattern = new Regex(@"^(\s*[>+\s])?", RegexOptions.IgnoreCase | RegexOptions.ECMAScript);
private static readonly Regex CssCommaPattern = new Regex(@"^\s*,", RegexOptions.IgnoreCase | RegexOptions.ECMAScript);

/// <summary>
/// Translates a CSS selector (or comma-separated selector group) into an XPath expression.
/// </summary>
/// <param name="rule">
/// The selector text only, e.g. <c>DIV OL &gt; LI P</c>; a trailing declaration block
/// (<c>{ ... }</c>) is not understood and stops the translation at that point.
/// </param>
/// <returns>
/// An XPath expression starting with <c>//</c>. An empty selector yields <c>//*</c>.
/// </returns>
/// <exception cref="System.ArgumentNullException"><paramref name="rule"/> is null.</exception>
/// <remarks>
/// Supported: type and universal selectors, <c>#id</c>, <c>.class</c>, <c>[attr]</c>,
/// <c>[attr="value"]</c>, <c>[attr~="value"]</c>, the descendant (whitespace), child (<c>&gt;</c>)
/// and adjacent-sibling (<c>+</c>) combinators, and comma-separated groups.
/// Pseudo-classes and pseudo-elements are dropped, and a namespace prefix (<c>ns|name</c>) is ignored.
/// Attribute values for <c>=</c> and <c>~=</c> must be double-quoted; anything else inside brackets
/// is emitted verbatim as an attribute test. Input that cannot be consumed is silently ignored.
/// </remarks>
public static string CssToXPath(string rule)
{
    if (rule == null)
    {
        throw new System.ArgumentNullException("rule");
    }

    List<string> parts = new List<string>();
    parts.Add("//");
    parts.Add("*");

    // Position in parts of the node test for the step currently being built.
    int index = 1;
    // True while the current step was introduced by '+', whose node test has a different shape.
    bool adjacentSibling = false;

    // lastRule detects an iteration that consumed nothing, which ends the loop on unparseable input.
    string lastRule = null;
    while (rule.Length > 0 && rule != lastRule)
    {
        lastRule = rule;

        // Trim leading and trailing whitespace
        rule = rule.Trim();
        if (rule.Length == 0)
        {
            break;
        }

        // Match the element identifier
        Match m = CssElementPattern.Match(rule);
        if (m.Success)
        {
            string prefix = m.Groups[1].Value;
            if (prefix.Length == 0)
            {
                //XXXjoe Namespace ignored for now
                string name = m.Groups[5].Length > 0 ? m.Groups[5].Value : m.Groups[2].Value;

                // The pattern also matches the empty string (e.g. when the rule now starts with
                // '[' or ':'); in that case the node test already recorded must be kept.
                if (name.Length > 0)
                {
                    if (!adjacentSibling)
                    {
                        parts[index] = name;
                    }
                    else if (name != "*")
                    {
                        // "*[1]" picks the immediately following sibling; self:: then checks its name.
                        parts[index] = "*[1][self::" + name + "]";
                    }
                }
            }
            else if (prefix == "#")
            {
                parts.Add("[@id='" + m.Groups[2].Value + "']");
            }
            else if (prefix == ".")
            {
                parts.Add(ContainsTokenPredicate("class", m.Groups[2].Value));
            }
            rule = rule.Substring(m.Length);
        }

        // Match attribute selectors
        m = CssAttributeValuePattern.Match(rule);
        if (m.Success)
        {
            if (m.Groups[2].Value == "~=")
            {
                parts.Add(ContainsTokenPredicate(m.Groups[1].Value, m.Groups[3].Value));
            }
            else
            {
                parts.Add("[@" + m.Groups[1].Value + "=" + ToXPathLiteral(m.Groups[3].Value) + "]");
            }
            rule = rule.Substring(m.Length);
        }
        else
        {
            m = CssAttributePresentPattern.Match(rule);
            if (m.Success)
            {
                parts.Add("[@" + m.Groups[1].Value + "]");
                rule = rule.Substring(m.Length);
            }
        }

        // Skip over pseudo-classes and pseudo-elements, which are of no use to us.
        // Re-match against the shortened rule each time: the pattern is anchored, so
        // continuing the previous match (NextMatch) could never find a second one.
        m = CssPseudoPattern.Match(rule);
        while (m.Success)
        {
            rule = rule.Substring(m.Length);
            m = CssPseudoPattern.Match(rule);
        }

        // Match the group separator before combinators, otherwise the whitespace in
        // "DIV , P" would be taken for a descendant combinator.
        m = CssCommaPattern.Match(rule);
        if (m.Success)
        {
            parts.Add(" | ");
            parts.Add("//");
            parts.Add("*");
            index = parts.Count - 1;
            adjacentSibling = false;
            rule = rule.Substring(m.Length);
            continue;
        }

        // Match combinators
        m = CssCombinatorPattern.Match(rule);
        if (m.Success && m.Length > 0)
        {
            string combinator = m.Value;
            if (combinator.IndexOf('>') != -1)
            {
                parts.Add("/");
                parts.Add("*");
                adjacentSibling = false;
            }
            else if (combinator.IndexOf('+') != -1)
            {
                parts.Add("/following-sibling::");
                parts.Add("*[1]");
                adjacentSibling = true;
            }
            else
            {
                parts.Add("//");
                parts.Add("*");
                adjacentSibling = false;
            }
            index = parts.Count - 1;
            rule = rule.Substring(m.Length);
        }
    }

    return string.Join("", parts.ToArray());
}

// Builds a predicate that is true when the whitespace-separated list in the given attribute
// contains the token as a whole word (so "forum" does not match class="forums").
private static string ContainsTokenPredicate(string attribute, string token)
{
    return "[contains(concat(' ', normalize-space(@" + attribute + "), ' '), "
        + ToXPathLiteral(" " + token + " ") + ")]";
}

// Wraps a value in XPath string-literal quotes, switching to double quotes when the value
// contains an apostrophe. Callers only pass text that cannot contain a double quote.
private static string ToXPathLiteral(string value)
{
    return value.IndexOf('\'') == -1
        ? "'" + value + "'"
        : "\"" + value + "\"";
}
Anyhow, hopefully this will help someone stumbling around teh intartubes looking for a solution to this.