hcxselect  1.1
A CSS selector engine for htmlcxx
hcxselect.cpp
00001 /*
00002  * hcxselect - A CSS selector engine for htmlcxx
00003  * Copyright (C) 2011 Jonas Gehring
00004  * All rights reserved.
00005  * 
00006  * Redistribution and use in source and binary forms, with or without
00007  * modification, are permitted provided that the following conditions are met:
00008  *     * Redistributions of source code must retain the above copyright
00009  *       notice, this list of conditions and the following disclaimer.
00010  *     * Redistributions in binary form must reproduce the above copyright
00011  *       notice, this list of conditions and the following disclaimer in the
00012  *       documentation and/or other materials provided with the distribution.
00013  *     * Neither the name of the copyright holders nor the
00014  *       names of its contributors may be used to endorse or promote products
00015  *       derived from this software without specific prior written permission.
00016  * 
00017  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00018  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00020  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
00021  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00022  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00023  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00024  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00025  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00026  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00027  * POSSIBILITY OF SUCH DAMAGE.
00028  */
00029 
00030 
00031 #include <stack>
00032 #include <sstream>
00033 #include <vector>
00034 
00035 #include "hcxselect.h"
00036 
00037 extern "C" {
00038         #include "lexer.h"
00039 }
00040 
00041 //#define TRACE printf("%s: ", __FUNCTION__); printf
00042 inline void TRACE(...) { }
00043 
00044 #define ENSURE(c, s) \
00045         if (!(c)) { throw ParseException(l->pos, s); }
00046 
00047 
00048 namespace hcxselect
00049 {
00050 
00051 // Anonymous namespace for local helpers
00052 namespace
00053 {
00054 
00055 typedef htmlcxx::HTML::Node HTMLNode;
00056 
00057 // Trims whitespace from the beginning and end of a string
00058 std::string trim(const std::string &str)
00059 {
00060         int start = 0;
00061         int end = str.length()-1;
00062         const char *data = str.c_str();
00063 
00064         while (start <= end && isspace(data[start])) ++start;
00065         while (end >= start && isspace(data[end])) --end;
00066 
00067         if (start > 0 || end < (int)str.length()) {
00068                 return str.substr(start, (end - start + 1));
00069         }
00070         return str;
00071 }
00072 
00073 // Wrapper for strcasecmp
00074 inline int strcasecmp(const std::string &s1, const std::string &s2)
00075 {
00076         return ::strcasecmp(s1.c_str(), s2.c_str());
00077 }
00078 
00079 // Checks for string prefix
00080 inline bool starts_with(const std::string &str, const std::string &start)
00081 {
00082         return !::strncasecmp(str.c_str(), start.c_str(), start.length());
00083 }
00084 
00085 // Checks for string suffix
00086 inline bool ends_with(const std::string &str, const std::string &end)
00087 {
00088         if (str.length() >= end.length()) {
00089                 return !::strcasecmp(str.c_str() + str.length() - end.length(), end.c_str());
00090         }
00091         return false;
00092 }
00093 
00094 // Delete all elements of a container
00095 template<typename T>
00096 inline void delete_all(const T &v)
00097 {
00098         for (typename T::const_iterator it = v.begin(); it != v.end(); ++it) {
00099                 delete *it;
00100         }
00101 }
00102 
00103 // String to number
00104 template <class T> inline bool stoi(T *t, const std::string& s)
00105 {
00106         std::istringstream iss(s);
00107         return !(iss >> *t).fail();
00108 }
00109 
00110 // Lexer for CSS selector grammar (wrapper for reentrant FLEX parser)
00111 class Lexer
00112 {
00113 public:
00114         Lexer(const std::string &str)
00115                 : pos(0), spos(0)
00116         {
00117                 yylex_init(&yy);
00118                 yy_scan_string(str.c_str(), yy);
00119         }
00120 
00121         ~Lexer()
00122         {
00123                 yylex_destroy(yy);
00124         }
00125 
00126         void unescape(std::string *str)
00127         {
00128                 size_t pos = 0;
00129                 while ((pos = str->find('\\', pos)) != std::string::npos) {
00130                         if ((*str)[pos+1] == '\\') {
00131                                 str->replace(pos, 2, "");
00132                         } else if (isdigit((*str)[pos+1])) {
00133                                 // Un-escape UTF-8 codes, sometimes followed by space
00134                                 size_t len = 1;
00135                                 while (isdigit((*str)[pos+len])) ++len;
00136                                 while (isspace((*str)[pos+len])) ++len;
00137                                 if (len > 1) {
00138                                         unsigned int x;
00139                                         std::stringstream ss;
00140                                         ss << std::hex << str->substr(pos+1, len-1);
00141                                         ss >> x;
00142                                         str->replace(pos, len, 1, x);
00143                                 }
00144                         } else {
00145                                 str->replace(pos, 1, "");
00146                         }
00147                 }
00148         }
00149 
00150         inline int lex(std::string *text)
00151         {
00152                 int token = yylex(yy);
00153                 spos += yyget_leng(yy);
00154                 if (token > 0) {
00155                         *text = yyget_text(yy);
00156                         pos = spos + 1 - text->length();
00157                 } else {
00158                         pos = spos;
00159                 }
00160                 if (token == IDENT) {
00161                         unescape(text);
00162                 }
00163                 return token;
00164         }
00165 
00166         yyscan_t yy;
00167         int pos, spos;
00168 };
00169 
00170 namespace Selectors
00171 {
00172 
00173 // Abstract base class for selector functions
00174 struct SelectorFn
00175 {
00176         typedef tree<HTMLNode>::iterator NodeIt;
00177 
00178         virtual ~SelectorFn() { }
00179         virtual bool match(const NodeIt &it) const = 0;
00180 
00181 protected:
00182         inline static bool hasParent(const NodeIt &it) {
00183                 return (it.node->parent == NULL || strcasecmp(it->tagName(), "html"));
00184         }
00185 };
00186 
00187 // Universal selector (*)
00188 struct Universal : SelectorFn
00189 {
00190         bool match(const NodeIt &it) const
00191         {
00192                 return it->isTag();
00193         }
00194 };
00195 
00196 // Type selector (E)
00197 struct Type : SelectorFn
00198 {
00199         Type(const std::string &type) : type(type) { }
00200 
00201         bool match(const NodeIt &it) const
00202         {
00203                 return (it->isTag() && !strcasecmp(it->tagName(), type));
00204         }
00205 
00206         std::string type;
00207 };
00208 
00209 // Attribute selector (E[foo])
00210 struct Attribute : SelectorFn
00211 {
00212         Attribute(const std::string &attr) : attr(attr) { }
00213 
00214         bool match(const NodeIt &it) const
00215         {
00216                 if (it->attributes().empty()) it->parseAttributes();
00217                 return it->attribute(attr).first;
00218         }
00219 
00220         std::string attr;
00221 };
00222 
00223 // Attribute value, with optional comparison operator (E[foo=bar])
00224 struct AttributeValue : SelectorFn
00225 {
00226         AttributeValue(const std::string &attr, const std::string &value, char c = '=')
00227                 : attr(attr), value(value), c(c) { }
00228 
00229         bool match(const NodeIt &it) const
00230         {
00231                 if (value.empty() && c != '=') return false;
00232 
00233                 if (it->attributes().empty()) it->parseAttributes();
00234                 const std::string &str = it->attribute(attr).second;
00235                 switch (c) {
00236                         case '=': return !strcasecmp(str, value);
00237                         case '^': return starts_with(str, value);
00238                         case '$': return ends_with(str, value);
00239                         case '*': return (str.find(value) != std::string::npos);
00240                         case '|': return !(strcasecmp(str, value) && !starts_with(str, value + "-"));
00241                         case '~': {
00242                                 // Split string by space and compare every part
00243                                 const char *ptr = str.c_str(), *last = str.c_str();
00244                                 const char *end = str.c_str() + str.length();
00245                                 int l = value.length();
00246                                 while (ptr < end) {
00247                                         while (*ptr && !isspace(*ptr)) ++ptr;
00248                                         if ((ptr - last) == l && !::strncasecmp(value.c_str(), last, l)) {
00249                                                 return true;
00250                                         }
00251                                         while (*ptr && isspace(*ptr)) ++ptr;
00252                                         last = ptr;
00253                                 }
00254                                 return false;
00255                         }
00256                         default: break;
00257                 }
00258                 return true;
00259         }
00260 
00261         std::string attr;
00262         std::string value;
00263         char c;
00264 };
00265 
00266 // Pseudo class or element
00267 struct Pseudo : SelectorFn
00268 {
00269         Pseudo(const std::string &type, int an = 0, int b = 0)
00270                 : type(type), an(an), b(b) { }
00271 
00272         bool checkNum(int i) const 
00273         {
00274                 if (an == 0) {
00275                         return (i == b);
00276                 }
00277                 return (((i - b) % an) == 0);
00278         }
00279 
00280         bool matchs(const NodeIt &it, const std::string &type) const
00281         {
00282                 if (type == "root") {
00283                         return !hasParent(it);
00284                 } else if (type == "first-child") {
00285                         if (!hasParent(it)) return false;
00286                         NodeIt jt(it.node->parent->first_child);
00287                         while (jt.node && !jt->isTag()) {
00288                                 jt = jt.node->next_sibling;
00289                         }
00290                         return (jt.node == it.node);
00291                 } else if (type == "last-child") {
00292                         if (!hasParent(it)) return false;
00293                         NodeIt jt(it.node->parent->last_child);
00294                         while (jt.node && !jt->isTag()) {
00295                                 jt = jt.node->prev_sibling;
00296                         }
00297                         return (jt.node == it.node);
00298                 } else if (type == "first-of-type") {
00299                         if (!hasParent(it)) return false;
00300                         NodeIt jt(it.node->parent->first_child);
00301                         while (jt.node && (!jt->isTag() || strcasecmp(jt->tagName(), it->tagName()))) {
00302                                 jt = jt.node->next_sibling;
00303                         }
00304                         return (jt.node == it.node);
00305                 } else if (type == "last-of-type") {
00306                         if (!hasParent(it)) return false;
00307                         NodeIt jt(it.node->parent->last_child);
00308                         while (jt.node && (!jt->isTag() || strcasecmp(jt->tagName(), it->tagName()))) {
00309                                 jt = jt.node->prev_sibling;
00310                         }
00311                         return (jt.node == it.node);
00312                 } else if (type == "empty") {
00313                         if (it->isTag()) {
00314                                 return (it.node->first_child == NULL ||
00315                                                 (it.node->first_child->data.isComment() && it.node->first_child == it.node->last_child));
00316                         }
00317                         return (it->isComment() || it->length() == 0);
00318                 } else if (type == "nth-child") {
00319                         if (!hasParent(it)) return false;
00320                         int i = 1;
00321                         NodeIt jt(it.node->parent->first_child);
00322                         while (jt.node && it.node != jt.node) {
00323                                 if (jt->isTag()) ++i;
00324                                 jt = jt.node->next_sibling;
00325                         }
00326                         return checkNum(i);
00327                 } else if (type == "nth-last-child") {
00328                         if (!hasParent(it)) return false;
00329                         int i = 1;
00330                         NodeIt jt(it.node->parent->last_child);
00331                         while (jt.node && it.node != jt.node) {
00332                                 if (jt->isTag()) ++i;
00333                                 jt = jt.node->prev_sibling;
00334                         }
00335                         return checkNum(i);
00336                 } else if (type == "nth-of-type") {
00337                         if (!hasParent(it)) return false;
00338                         int i = 1;
00339                         NodeIt jt(it.node->parent->first_child);
00340                         while (jt.node && it.node != jt.node) {
00341                                 if (jt->isTag() && !strcasecmp(jt->tagName(), it->tagName())) ++i;
00342                                 jt = jt.node->next_sibling;
00343                         }
00344                         return checkNum(i);
00345                 } else if (type == "nth-last-of-type") {
00346                         if (!hasParent(it)) return false;
00347                         int i = 1;
00348                         NodeIt jt(it.node->parent->last_child);
00349                         while (jt.node && it.node != jt.node) {
00350                                 if (jt->isTag() && !strcasecmp(jt->tagName(), it->tagName())) ++i;
00351                                 jt = jt.node->prev_sibling;
00352                         }
00353                         return checkNum(i);
00354                 } else if (type == "text") {
00355                         return (!it->isTag() && !it->isComment());
00356                 } else if (type == "comment") {
00357                         return it->isComment();
00358                 }
00359                 return false;
00360         }
00361 
00362         bool match(const NodeIt &it) const
00363         {
00364                 if (type == "only-child") {
00365                         return matchs(it, "first-child") && matchs(it, "last-child");
00366                 } else if (type == "only-of-type") {
00367                         return matchs(it, "first-of-type") && matchs(it, "last-of-type");
00368                 }
00369                 return matchs(it, type);
00370         }
00371 
00372         std::string type;
00373         int an, b;
00374 };
00375 
00376 // Negation (:not)
00377 struct Negation : SelectorFn
00378 {
00379         Negation(SelectorFn *fn) : fn(fn) { }
00380         ~Negation() { delete fn; }
00381 
00382         bool match(const NodeIt &it) const
00383         {
00384                 return !fn->match(it);
00385         }
00386 
00387         SelectorFn *fn;
00388 };
00389 
00390 // A simple selector sequence
00391 struct SimpleSequence : SelectorFn
00392 {
00393         SimpleSequence(const std::vector<SelectorFn *> &fns) : fns(fns) { }
00394         ~SimpleSequence() { delete_all(fns); }
00395 
00396         bool match(const NodeIt &it) const
00397         {
00398                 std::vector<SelectorFn *>::const_iterator ft(fns.begin());
00399                 std::vector<SelectorFn *>::const_iterator end(fns.end());
00400                 while (ft != end && (*ft)->match(it)) {
00401                         ++ft;
00402                 }
00403                 return (ft == end);
00404         }
00405 
00406         std::vector<SelectorFn *> fns;
00407 };
00408 
00409 // Combinator ( , >, ~, +)
00410 struct Combinator : SelectorFn
00411 {
00412         Combinator(SelectorFn *left, SelectorFn *right, char c) : left(left), right(right), c(c) { }
00413         ~Combinator() { delete left; delete right; }
00414 
00415         bool match(const NodeIt &it) const
00416         {
00417                 // First, check if the node matches the right side of the combinator
00418                 if (!right->match(it)) {
00419                         return false;
00420                 }
00421 
00422                 // Check all suitable neighbor nodes using the left selector
00423                 NodeIt jt;
00424                 switch (c) {
00425                         case ' ': // Descendant
00426                         case '*': // Greatchild or further descendant
00427                                 if (!hasParent(it)) return false;
00428                                 jt = it.node->parent;
00429                                 if (c == '*' && jt.node) {
00430                                         jt = jt.node->parent;
00431                                 }
00432                                 while (jt.node) {
00433                                         if (left->match(jt)) {
00434                                                 return true;
00435                                         }
00436                                         jt = jt.node->parent;
00437                                 }
00438                                 return false;
00439 
00440                         case '>': // Child
00441                                 if (!hasParent(it)) return false;
00442                                 jt = it.node->parent;
00443                                 return jt.node && left->match(jt);
00444 
00445                         case '+': // Adjacent sibling
00446                                 if (!hasParent(it)) return false;
00447                                 jt = it.node->prev_sibling;
00448                                 while (jt.node && !jt->isTag()) {
00449                                         jt = jt.node->prev_sibling;
00450                                 }
00451                                 return jt.node && left->match(jt);
00452 
00453                         case '~': // General sibling
00454                                 if (!hasParent(it)) return false;
00455                                 jt = it.node->prev_sibling;
00456                                 while (jt.node) {
00457                                         if (jt->isTag() && left->match(jt)) {
00458                                                 return true;
00459                                         }
00460                                         jt = jt.node->prev_sibling;
00461                                 }
00462                                 return false;
00463 
00464                         default: break;
00465                 }
00466 
00467                 return false;
00468         }
00469 
00470         SelectorFn *left, *right;
00471         char c;
00472 };
00473 
00474 } // namespace Selectors
00475 
00476 using Selectors::SelectorFn;
00477 
00478 SelectorFn *parseSelector(Lexer *l, int &token, std::string &s);
00479 
00480 // Tries to parse a simple selector
00481 SelectorFn *parseSimpleSequence(Lexer *l, int &token, std::string &s)
00482 {
00483         std::vector<SelectorFn *> fns;
00484 
00485         // [ type_selector | universal ]
00486         TRACE("%d: %s\n", token, s.c_str());
00487         if (token == IDENT) {
00488                 fns.push_back(new Selectors::Type(s));
00489                 token = l->lex(&s);
00490         } else if (token == '*') {
00491                 fns.push_back(new Selectors::Universal());
00492                 token = l->lex(&s);
00493         }
00494 
00495         // [ HASH | class | attrib | pseudo | negation ]*
00496         bool lex = true;
00497         while (token) {
00498                 switch (token) {
00499                 case HASH:
00500                         fns.push_back(new Selectors::AttributeValue("id", s.substr(1)));
00501                         break;
00502                 case '.':
00503                         token = l->lex(&s);
00504                         ENSURE(token == IDENT, "Identifier expected");
00505                         TRACE("%d: %s\n", token, s.c_str());
00506                         fns.push_back(new Selectors::AttributeValue("class", s, '~'));
00507                         break;
00508                 case '[': {
00509                         token = l->lex(&s);
00510                         if (token == S) token = l->lex(&s);
00511                         ENSURE(token == IDENT, "Identifier expected");
00512                         std::string a = s;
00513 
00514                         token = l->lex(&s);
00515                         if (token == S) token = l->lex(&s);
00516                         if (token == ']') {
00517                                 fns.push_back(new Selectors::Attribute(a));
00518                                 break;
00519                         }
00520 
00521                         int c = 0;
00522                         switch (token) {
00523                                 case INCLUDES: c = '~'; break;
00524                                 case DASHMATCH: c = '|'; break;
00525                                 case PREFIXMATCH: c = '^'; break;
00526                                 case SUFFIXMATCH: c = '$'; break;
00527                                 case SUBSTRINGMATCH: c = '*'; break;
00528                                 case '=': c = '='; break;
00529                                 default: throw ParseException(l->pos, "Invalid character");
00530                         }
00531                         TRACE("got %d, %c\n", token, token);
00532 
00533                         token = l->lex(&s);
00534                         if (token == S) token = l->lex(&s);
00535                         ENSURE(token == STRING || token == IDENT, "Token is neither string nor identifier"); 
00536                         std::string v = (token == STRING ? s.substr(1, s.length()-2) : s);
00537 
00538                         fns.push_back(new Selectors::AttributeValue(a, v, c));
00539                         token = l->lex(&s);
00540                         if (token == S) token = l->lex(&s);
00541                         ENSURE(token == ']', "']' expected");
00542                         break;
00543                 }
00544                 case ':': {
00545                         token = l->lex(&s);
00546                         if (token == ':') { token = l->lex(&s); s.insert(0, ":"); }
00547                         if (token == IDENT) {
00548                                 fns.push_back(new Selectors::Pseudo(s));
00549                         } else if (token == FUNCTION) {
00550                                 std::string f(s.substr(0, s.length()-1));
00551                                 int an = 0, b = 0;
00552                                 token = l->lex(&s);
00553                                 if (token == S) token = l->lex(&s);
00554                                 if (token == IDENT) {
00555                                         if (s == "odd" || s == "even") {
00556                                                 an = 2; b = (s == "odd" ? 1 : 0);
00557                                                 token = l->lex(&s);
00558                                                 ENSURE(token == ')', "')' expected");
00559                                         } else if (s == "n" || s == "-n") {
00560                                                 an = (s == "n" ? 1 : -1);
00561                                                 token = l->lex(&s);
00562                                                 if (token == PLUS) {
00563                                                         token = l->lex(&s);
00564                                                         if (token == S) token = l->lex(&s);
00565                                                         ENSURE(token == NUMBER && stoi(&b, s), "Number expected");
00566                                                         token = l->lex(&s);
00567                                                         ENSURE(token == ')', "')' expected");
00568                                                 } else if (token != ')') {
00569                                                         throw ParseException(l->pos, "')' expected");
00570                                                 }
00571                                         } else {
00572                                                 throw ParseException(l->pos, "odd/even/n expected");
00573                                         }
00574                                 } else if (token == DIMENSION) {
00575                                         std::istringstream ss(s);
00576                                         ss >> an >> s;
00577                                         ENSURE(!ss.fail() && s == "n", "Expression of form 'an' expected");
00578                                         token = l->lex(&s);
00579                                         if (token == PLUS) {
00580                                                 token = l->lex(&s);
00581                                                 if (token == S) token = l->lex(&s);
00582                                                 ENSURE(token == NUMBER && stoi(&b, s), "Number expected");
00583                                                 token = l->lex(&s);
00584                                                 ENSURE(token == ')', "')' expected");
00585                                         }
00586                                 } else if (token == NUMBER) {
00587                                         ENSURE(stoi(&b, s), "Number expected");
00588                                         token = l->lex(&s);
00589                                         ENSURE(token == ')', "')' expected");
00590                                 } else {
00591                                         throw ParseException(l->pos, "Invalid expression");
00592                                 }
00593                                 TRACE("pseudo %s,with %dn + %d\n", f.c_str(), an, b);
00594                                 fns.push_back(new Selectors::Pseudo(f, an, b));
00595                         } else {
00596                                 throw ParseException(l->pos, "Identifier or funtion expected");
00597                         }
00598                         break;
00599                 }
00600                 case NOT: {
00601                         token = l->lex(&s);
00602                         fns.push_back(new Selectors::Negation(parseSelector(l, token, s)));
00603                         ENSURE(token == ')', "')' expected");
00604                         break;
00605                 }
00606                 case ')': // For negations
00607                 default: goto finish;
00608                 }
00609 
00610                 if (lex) {
00611                         token = l->lex(&s);
00612                 }
00613                 lex = true;
00614         }
00615 
00616 finish:
00617         return new Selectors::SimpleSequence(fns);
00618 }
00619 
00620 // Recursive parsing function
00621 SelectorFn *parseSelector(Lexer *l, int &token, std::string &s)
00622 {
00623         if (token == S) token = l->lex(&s);
00624         SelectorFn *fn = parseSimpleSequence(l, token, s);
00625 
00626         while (token) {
00627                 TRACE("%d: %s\n", token, s.c_str());
00628                 bool space = false;
00629                 if (token == S) {
00630                         space = true;
00631                         token = l->lex(&s);
00632                 }
00633                 TRACE("%d: %s\n", token, s.c_str());
00634 
00635                 char c = -1;
00636                 switch (token) {
00637                         case S: c = ' '; break;
00638                         case PLUS: c = '+'; break;
00639                         case GREATER: c = '>'; break;
00640                         case TILDE: c = '~'; break;
00641                         case '*': c = '*'; break;
00642 
00643                         case 0: return fn;
00644                         default:
00645                                 if (space) {
00646                                         c = ' ';
00647                                 } else {
00648                                         return fn;
00649                                 }
00650                                 break;
00651                 }
00652 
00653                 if (c != ' ') {
00654                         token = l->lex(&s);
00655                         TRACE("%d: %s\n", token, s.c_str());
00656                         if (token == S) token = l->lex(&s);
00657                 }
00658                 TRACE("%d: %s\n", token, s.c_str());
00659                 SelectorFn *fn2 = parseSimpleSequence(l, token, s);
00660                 fn = new Selectors::Combinator(fn, fn2, c);
00661         }
00662 
00663         return fn;
00664 }
00665 
00666 // Parses a CSS selector expression and returns a set of functions
00667 std::vector<SelectorFn *> parse(const std::string &expr)
00668 {
00669         std::vector<SelectorFn *> fns;
00670         int token;
00671         std::string s;
00672 
00673         Lexer l(trim(expr));
00674         while ((token = l.lex(&s))) {
00675                 fns.push_back(parseSelector(&l, token, s));
00676                 if (token != COMMA && token != 0) {
00677                         throw ParseException(l.pos, "Comma expected");
00678                 }
00679         }
00680 
00681         return fns;
00682 }
00683 
00684 // Matches a set of nodes against a selector
00685 NodeSet match(const NodeSet &nodes, const SelectorFn *fn)
00686 {
00687         std::stack<tree<HTMLNode>::iterator> stack;
00688         for (NodeSet::const_iterator it(nodes.begin()); it != nodes.end(); ++it) {
00689                 stack.push(tree<HTMLNode>::iterator(*it));
00690         }
00691 
00692         // Depth-first traversal using a stack
00693         NodeSet result;
00694         while (!stack.empty()) {
00695                 tree<HTMLNode>::iterator it = stack.top();
00696                 stack.pop();
00697 
00698                 // Check if selector matches
00699                 if (fn->match(it)) {
00700                         result.insert(it.node);
00701                 }
00702 
00703                 // Inspect all child nodes of non-matching elements
00704                 tree<HTMLNode>::sibling_iterator jt;
00705                 for (jt = it.begin(); jt != it.end(); ++jt) {
00706                         stack.push(jt);
00707                 }
00708         }
00709 
00710         return result;
00711 }
00712 
00713 } // Anonymous namespace
00714 
00715 
00719 bool NodeComp::operator()(Node *a, Node *b) const {
00720         if (a == NULL || b == NULL) {
00721                 return a < b;
00722         }
00723         return a->data.offset() < b->data.offset();
00724 }
00725 
00726 // Applies a CSS selector expression to a document tree.
00727 NodeSet select(const tree<HTMLNode> &tree, const std::string &expr)
00728 {
00729         // Select the <html> node from the tree and use it as the root node
00730         NodeSet v;
00731         ::tree<HTMLNode>::iterator it;
00732         for (it = tree.begin(); it != tree.end(); ++it) {
00733                 if (!strcasecmp(it->tagName(), "html")) {
00734                         v.insert(it.node);
00735                         break;
00736                 }
00737         }
00738 
00739         if (expr.empty()) {
00740                 return v;
00741         }
00742         return select(v, expr);
00743 }
00744 
00745 // Applies a CSS selector expression to a set of nodes.
00746 NodeSet select(const NodeSet &nodes, const std::string &expr)
00747 {
00748         // Parse expression
00749         std::vector<SelectorFn *> fns = parse(expr);
00750 
00751         NodeSet result;
00752         std::vector<SelectorFn *>::const_iterator it;
00753         for (it = fns.begin(); it != fns.end(); ++it) {
00754                 NodeSet v = match(nodes, *it);
00755                 result.insert(v.begin(), v.end());
00756         }
00757 
00758         delete_all(fns);
00759         return result;
00760 }
00761 
00762 
00766 Selection::Selection()
00767 {
00768 }
00769 
00774 Selection::Selection(const tree<htmlcxx::HTML::Node> &tree, const std::string &expr)
00775 {
00776         NodeSet v = hcxselect::select(tree, expr);
00777         insert(v.begin(), v.end());
00778 }
00779 
00784 Selection::Selection(const NodeSet &nodes, const std::string &expr)
00785 {
00786         if (!expr.empty()) {
00787                 NodeSet v = hcxselect::select(nodes, expr);
00788                 insert(v.begin(), v.end());
00789         } else {
00790                 insert(nodes.begin(), nodes.end());
00791         }
00792 }
00793 
00798 Selection Selection::select(const std::string &expr)
00799 {
00800         return hcxselect::select(*this, expr);
00801 }
00802 
00803 } // namespace hcxselect