加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
gumbowrap.c 12.60 KB
一键复制 编辑 原始数据 按行查看 历史
hotmocha 提交于 2015-04-02 22:29 . spider init
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
#include "gumbowrap.h"
#include <stdio.h>
#include "ilog.h"
#include "simplebuf.h"
/* Debug switch: WalkHtmlPath prints trace output to stdout when this is non-zero. */
int htmldebug = 0;
/* Name handed back by TagEnumToName / TypeEnumToName for values outside the known range. */
const char *invalid = "INVALID";
/* Path terminator: the walkers treat a path as finished when the next entry
   compares byte-for-byte equal (memcmp) to this node. */
const HtmlPathNode ENDNODE = __HTMLPATHNODEEND__;
/*
 * Read the tag stored in the element view of a node.
 *
 * Returns GUMBO_ARGNULL when n is NULL, otherwise the value of
 * n->v.element.tag converted to int. The node type is not inspected here.
 */
int GetTag(GumboNode *n)
{
    return (n != NULL) ? (int)n->v.element.tag : GUMBO_ARGNULL;
}
/*
 * Read the number of entries in the element child vector of a node.
 *
 * Returns GUMBO_ARGNULL when n is NULL, otherwise
 * n->v.element.children.length converted to int.
 */
int GetLen(GumboNode *n)
{
    return (n != NULL) ? (int)n->v.element.children.length : GUMBO_ARGNULL;
}
/*
 * Fetch the child stored at position `index` in n's element child vector.
 *
 *   n      node whose child vector is read
 *   index  zero-based position of the wanted child
 *   child  out: receives the child pointer on success, untouched on failure
 *
 * Returns 0 on success, GUMBO_ARGNULL if n or child is NULL, and
 * GUMBO_PARAMETER if index is not smaller than the number of children.
 */
int FindChildByIndex(GumboNode *n, unsigned int index, GumboNode **child)
{
    unsigned int count;

    if (n == NULL || child == NULL)
        return GUMBO_ARGNULL;

    /* The count is unsigned, so the bounds test below is the only check
       needed; a "< 0" comparison on it could never be true. */
    count = n->v.element.children.length;
    if (index >= count)
        return GUMBO_PARAMETER;

    *child = n->v.element.children.data[index];
    return 0;
}
/*
 * Find the first direct child of n whose tag equals `tag`.
 *
 *   n      node whose children are scanned in order
 *   tag    tag value to look for; must be within [0, GUMBO_TAG_LAST]
 *   child  out: receives the matching child; left untouched when nothing
 *          matches
 *
 * Returns 0 on success, GUMBO_ARGNULL for a NULL pointer argument or an
 * out-of-range tag, and GUMBO_NOTFOUND when no child carries the tag.
 */
int FindChildByTag(GumboNode *n, int tag, GumboNode **child)
{
    unsigned int len, i;

    if (n == NULL || child == NULL || tag < 0 || tag > GUMBO_TAG_LAST)
        return GUMBO_ARGNULL;

    len = GetLen(n);
    for (i = 0; i < len; i++) {
        GumboNode *candidate = n->v.element.children.data[i];

        /* Only publish a real match, so a failed search does not leave
           the caller holding an unrelated child. */
        if (candidate != NULL && GetTag(candidate) == tag) {
            *child = candidate;
            return 0;
        }
    }
    return GUMBO_NOTFOUND;
}
/*
 * Look up the attribute named `id` on node n and hand back its value string.
 *
 *   n      node whose element attribute vector is searched
 *   id     attribute name passed to gumbo_get_attribute
 *   value  out: receives the attribute's value pointer on success
 *
 * Returns 0 on success, GUMBO_ARGNULL if any argument is NULL, and
 * GUMBO_NOTFOUND if gumbo_get_attribute finds no such attribute.
 */
int GetAttrByID(GumboNode *n, char *id, const char **value)
{
    GumboAttribute *found;

    if (n == NULL || id == NULL || value == NULL)
        return GUMBO_ARGNULL;

    found = gumbo_get_attribute(&n->v.element.attributes, id);
    if (found == NULL)
        return GUMBO_NOTFOUND;

    *value = found->value;
    return 0;
}
/*
 * Return the attribute object named `id` on node n.
 *
 * Returns NULL when n or id is NULL; otherwise whatever
 * gumbo_get_attribute returns for n's element attribute vector.
 */
GumboAttribute* GetAttr(GumboNode *n, char *id)
{
    if (n == NULL || id == NULL)
        return NULL;

    return gumbo_get_attribute(&n->v.element.attributes, id);
}
/*
 * Find the first direct child of n that has tag `tag` and whose attribute
 * `id` has exactly the value `value`.
 *
 *   n      node whose children are scanned in order
 *   tag    tag value to look for; must be within [0, GUMBO_TAG_LAST]
 *   id     attribute name
 *   value  required attribute value (compared with strcmp)
 *   child  out: receives the matching child; left untouched when nothing
 *          matches
 *
 * Returns 0 on success, GUMBO_ARGNULL for a NULL pointer argument or an
 * out-of-range tag, and GUMBO_NOTFOUND when no child satisfies both
 * conditions.
 */
int FindChildByTagAndAttr(GumboNode *n, int tag, char *id, char *value, GumboNode **child)
{
    unsigned int len, i;

    if (n == NULL || child == NULL || tag < 0 || tag > GUMBO_TAG_LAST)
        return GUMBO_ARGNULL;
    if (id == NULL || value == NULL)
        return GUMBO_ARGNULL;

    len = GetLen(n);
    for (i = 0; i < len; i++) {
        GumboNode *candidate = n->v.element.children.data[i];
        const char *attrvalue = NULL;

        if (candidate == NULL || GetTag(candidate) != tag)
            continue;

        /* A sibling with the right tag but without the attribute must not
           end the search: later siblings may still match. */
        if (GetAttrByID(candidate, id, &attrvalue) != 0 || attrvalue == NULL)
            continue;

        if (strcmp(attrvalue, value) == 0) {
            *child = candidate;
            return 0;
        }
    }
    return GUMBO_NOTFOUND;
}
/*
 * Read the type field of a node.
 *
 * Returns GUMBO_ARGNULL when n is NULL, otherwise n->type converted to int.
 */
int GetNodeType(GumboNode *n)
{
    return (n != NULL) ? (int)n->type : GUMBO_ARGNULL;
}
/* True for the characters that are trimmed from attribute values. */
static int CheckAttrIsBlank(char c)
{
    return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}

/*
 * Check that node `root` satisfies every attribute constraint of `path`.
 *
 * The constraints are the parallel entries path->attrname[i] /
 * path->attrvalue[i]; the list ends at the first empty attribute name.
 * For each constraint the node's attribute value is trimmed of leading and
 * trailing blanks (space, tab, CR, LF) and must then equal the expected
 * value exactly.
 *
 * Returns 0 when all constraints hold (including the case of no
 * constraints), GUMBO_PARAMETER if root or path is NULL, the error from
 * GetAttrByID when an attribute is missing, and GUMBO_ATTRERROR when a
 * value is empty after trimming or differs from the expected value.
 */
int CheckAttr(GumboNode *root, HtmlPathNode *path)
{
    int i;

    if (root == NULL || path == NULL)
        return GUMBO_PARAMETER;

    for (i = 0; path->attrname[i][0] != '\0'; i++) {
        const char *expected = path->attrvalue[i];
        const char *actual = NULL;
        const char *start;
        const char *end;   /* one past the last kept character */
        size_t n;
        int ret;

        ret = GetAttrByID(root, path->attrname[i], &actual);
        if (ret)
            return ret;
        if (actual == NULL)
            return GUMBO_ATTRERROR;

        /* Trim inside [start, end) so neither pointer can leave the string,
           even for an empty or all-blank value. */
        start = actual;
        end = actual + strlen(actual);
        while (start < end && CheckAttrIsBlank(*start))
            start++;
        while (end > start && CheckAttrIsBlank(end[-1]))
            end--;
        if (start == end)
            return GUMBO_ATTRERROR;

        /* Compare the trimmed span and require the expected value to end
           at the same place; otherwise "foo" would match "foobar". */
        n = (size_t)(end - start);
        if (strncmp(start, expected, n) != 0 || expected[n] != '\0')
            return GUMBO_ATTRERROR;
    }
    return 0;
}
/*
 * Exact path lookup. Example from the original author: if a tag has two span
 * children and the path only specifies TAG=span, only the first node that
 * satisfies the condition is checked and recursed into.
 *
 * root  node whose children are matched against *path
 * path  current path node; path + 1 is the next one, and the path ends when
 *       the next node compares equal (memcmp) to ENDNODE
 * node  out: receives the matched node when the last path node matches
 *
 * Returns GUMBO_NOTFOUND when any argument is NULL, when a child cannot be
 * fetched, when a selected child has the wrong node type, or when no child
 * matches. When the last path node matches, stores the child in *node and
 * returns 0; otherwise returns the result of the recursive call.
 *
 * NOTE(review): the recursion goes into WalkHtmlPath, not WalkHtmlPathExact,
 * so only this level uses the "exact" rules -- confirm whether intended.
 */
int WalkHtmlPathExact(GumboNode *root, HtmlPathNode *path, GumboNode **node)
{
#if DEBUG
printf("=================================================\n");
#endif
HtmlPathNode *next = NULL;
unsigned int len, i;
/* Number of children accepted by the tag check so far (1-based position). */
int validnode = 0;
/* NOTE(review): factindex is assigned below but never read. */
int factindex = 0;
int ret = 0;
if (root == NULL || path == NULL || node == NULL)
return GUMBO_NOTFOUND;
#if DEBUG
if (path->tag != UNDEFINED)
printf("path->tag[%d],tagname[%s]\n", path->tag, gumbo_normalized_tagname(path->tag));
if (path->index != UNDEFINED)
printf("path->index[%d]\n", path->index);
i = 0;
/* Dump the attribute constraints; the list ends at the first empty name. */
while(1) {
if (strlen(path->attrname[i]) == 0 || strcmp(path->attrname[i], "") == 0) {
break;
}
printf("name[%s]-value[%s]\n", path->attrname[i], path->attrvalue[i]);
i++;
}
#endif
next = path + 1;
/* If no index is defined, the first one is meant */
if (path->index == UNDEFINED)
factindex = 1;
len = GetLen(root);
for (i = 0; i < len; i++) {
GumboNode *child = NULL;
ret = FindChildByIndex(root, i, &child);
if (ret) {
return GUMBO_NOTFOUND;
}
/* If the path's type is text it is usually the last node, so the tag need
   not be checked. For every other path type, children whose tag value is
   <= 0 or >= GUMBO_TAG_LAST are skipped. */
if ( ( path->type != GUMBO_NODE_TEXT &&
path->type != GUMBO_NODE_WHITESPACE &&
path->type != GUMBO_NODE_CDATA &&
path->type != GUMBO_NODE_DOCUMENT
)
&&
( GetTag(child) <= 0 ||
GetTag(child) >= GUMBO_TAG_LAST
)
)
{
continue;
}
#if DEBUG
printf("++++child[%d]start+++\n", i);
if (child->type == GUMBO_NODE_ELEMENT) {
printf("child[%d]->tag[%d]-[%s]\n", i, GetTag(child),gumbo_normalized_tagname(GetTag(child)));
GumboVector *vec = NULL;
vec = &child->v.element.attributes;
if (vec) {
int attrlen = vec->length;
printf("attr cnt[%d]\n", attrlen);
int j = 0;
for (j = 0; j < attrlen; j++) {
/* NOTE(review): aa is never used. */
const char *aa = NULL;
GumboAttribute *attr = (GumboAttribute*)vec->data[j];
printf("Attrname[%s]-Attrvalue[%s]\n", attr->name, attr->value);
}
}
}
else if (child->type == GUMBO_NODE_TEXT) {
printf("TEXT[%s]\n", child->v.text.text);
}
printf("----child[%d]end----\n", i);
#endif
/* Check the tag: a child counts towards validnode when the path has no tag
   or the tags are equal. */
int temptag = GetTag(child);
if (path->tag != UNDEFINED) {
if (temptag != path->tag)
continue;
else
validnode++;
}
else
validnode++;
/* Check the index */
if (path->index == UNDEFINED || validnode == path->index) {
/* Check the type: a mismatch ends the whole search, not just this child */
if (path->type != UNDEFINED) {
if (GetNodeType(child) != path->type) {
return GUMBO_NOTFOUND;
}
}
/* Check the attributes */
if (CheckAttr(child, path)) {
/* With an UNDEFINED index a mismatch is acceptable: try the next child */
continue;
}
/* pass all check */
/* Last path node: this child is the result */
if (memcmp(next, &ENDNODE, sizeof(HtmlPathNode)) == 0) {
*node = child;
return 0;
}
else {
/* Not the last path node: descend and return that result unconditionally */
return WalkHtmlPath(child, next, node);
}
}
}
return GUMBO_NOTFOUND;
}
/* Debug trace: print the constraints carried by one path node. */
static void WalkHtmlPathTracePath(HtmlPathNode *path)
{
    int k;

    printf("path 信息+++++++++\n");
    if (path->tag != UNDEFINED)
        printf("path->tag[%d] tagname[%s] ", path->tag, gumbo_normalized_tagname(path->tag));
    if (path->index != UNDEFINED)
        printf("path->index[%d]", path->index);
    printf("\n");
    /* The attribute list ends at the first empty name. */
    for (k = 0; path->attrname[k][0] != '\0'; k++)
        printf("name[%s]-value[%s]\n", path->attrname[k], path->attrvalue[k]);
    printf("path 信息--------\n");
}

/* Debug trace: print the tag and attributes (or the text) of one child. */
static void WalkHtmlPathTraceChild(GumboNode *child, unsigned int i)
{
    printf("++++子标签[%u]start+++\n", i);
    if (child->type == GUMBO_NODE_ELEMENT) {
        GumboVector *vec = &child->v.element.attributes;
        int attrlen = vec->length;
        int j;

        printf("child[%u]->tag[%d]-[%s]\n", i, GetTag(child), gumbo_normalized_tagname(GetTag(child)));
        printf("attr cnt[%d]\n", attrlen);
        for (j = 0; j < attrlen; j++) {
            GumboAttribute *attr = (GumboAttribute*)vec->data[j];
            printf("Attrname[%s]-Attrvalue[%s]\n", attr->name, attr->value);
        }
    }
    else if (child->type == GUMBO_NODE_TEXT) {
        printf("TEXT[%s]\n", child->v.text.text);
    }
    printf("----子标签[%u]end----\n", i);
}

/*
 * Walk a path of HtmlPathNode entries down from `root`, backtracking over
 * siblings: when a child satisfies the current path node but the rest of the
 * path cannot be matched below it, the search goes on with the next child.
 *
 *   root  node whose children are matched against *path
 *   path  current path node; path + 1 is the next one, and the path ends
 *         when the next node compares equal (memcmp) to ENDNODE
 *   node  out: receives the matched node on success
 *
 * Per child, in order:
 *   - unless path->type is TEXT, WHITESPACE, CDATA or DOCUMENT, children
 *     whose tag value is <= 0 or >= GUMBO_TAG_LAST are skipped;
 *   - if path->tag is defined the child's tag must equal it; each child
 *     passing this step gets the next 1-based position;
 *   - if path->index is defined only the child at that position is
 *     considered, and the scan stops once the position has been passed;
 *   - if path->type is defined and differs from the child's node type the
 *     whole search ends with GUMBO_NOTFOUND;
 *   - the child must pass CheckAttr.
 *
 * Returns 0 with *node set when the whole path matched, GUMBO_NOTFOUND
 * otherwise (also for NULL arguments). Trace output is printed when the
 * global htmldebug is non-zero.
 */
int WalkHtmlPath(GumboNode *root, HtmlPathNode *path, GumboNode **node)
{
    HtmlPathNode *next;
    unsigned int len, i;
    int validnode = 0;

    if (htmldebug)
        printf("=================================================\n");
    if (root == NULL || path == NULL || node == NULL)
        return GUMBO_NOTFOUND;
    if (htmldebug)
        WalkHtmlPathTracePath(path);

    next = path + 1;
    len = GetLen(root);
    for (i = 0; i < len; i++) {
        GumboNode *child = NULL;
        int childtag;

        if (FindChildByIndex(root, i, &child))
            return GUMBO_NOTFOUND;
        /* An empty slot can never match; skip it rather than dereference it. */
        if (child == NULL)
            continue;

        childtag = GetTag(child);
        if (path->type != GUMBO_NODE_TEXT &&
            path->type != GUMBO_NODE_WHITESPACE &&
            path->type != GUMBO_NODE_CDATA &&
            path->type != GUMBO_NODE_DOCUMENT &&
            (childtag <= 0 || childtag >= GUMBO_TAG_LAST))
            continue;

        if (htmldebug)
            WalkHtmlPathTraceChild(child, i);

        /* Tag check: only children passing it are counted for the index. */
        if (path->tag != UNDEFINED && childtag != path->tag)
            continue;
        validnode++;
        if (htmldebug)
            printf("validnode[%d]-index[%d]\n", validnode, path->index);

        /* Index check: stop once the requested position is behind us, and
           skip candidates that come before it. */
        if (path->index != UNDEFINED) {
            if (path->index < validnode)
                break;
            if (path->index != validnode)
                continue;
        }

        /* Type check: a mismatch ends the whole search. */
        if (path->type != UNDEFINED && GetNodeType(child) != path->type)
            return GUMBO_NOTFOUND;
        if (htmldebug)
            printf("okk");

        /* Attribute check: on mismatch try the next sibling. */
        if (CheckAttr(child, path))
            continue;
        if (htmldebug)
            printf("okk1");

        /* Last path node: this child is the result. */
        if (memcmp(next, &ENDNODE, sizeof(HtmlPathNode)) == 0) {
            *node = child;
            if (htmldebug)
                printf("okk3");
            return 0;
        }

        /* Otherwise descend; if the rest of the path fails below this
           child, fall through and keep looking among the siblings. */
        if (WalkHtmlPath(child, next, node) == 0)
            return 0;
    }
    return GUMBO_NOTFOUND;
}
/*
 * Copy the text of the TEXT node found at `path` below `root` into `text`.
 *
 *   root  node the path walk starts from
 *   path  path description handed to WalkHtmlPath
 *   text  out: destination buffer
 *   len   size of the destination buffer in bytes, must be > 0
 *
 * The result is always NUL-terminated; text longer than len - 1 bytes is
 * truncated.
 *
 * Returns 0 on success, GUMBO_ARGNULL for a NULL pointer argument or a
 * non-positive len, the error from WalkHtmlPath when the path is not found,
 * and GUMBO_PARSEERROR when the node found is not a TEXT node.
 */
int HtmlGetTextByPath(GumboNode *root, HtmlPathNode *path, char *text, int len)
{
    int ret;
    GumboNode *node = NULL;

    if (root == NULL || path == NULL || text == NULL || len <= 0)
        return GUMBO_ARGNULL;

    ret = WalkHtmlPath(root, path, &node);
    if (ret)
        return ret;

    if (node == NULL || GetNodeType(node) != GUMBO_NODE_TEXT)
        return GUMBO_PARSEERROR;

    /* snprintf writes at most len bytes including the terminator, so the
       buffer can neither overflow nor be left unterminated. */
    snprintf(text, (size_t)len, "%s", node->v.text.text);
    return 0;
}
/*
 * Copy the value of attribute `id` of the ELEMENT node found at `path`
 * below `root` into `text`.
 *
 *   root  node the path walk starts from
 *   path  path description handed to WalkHtmlPath
 *   id    attribute name
 *   text  out: destination buffer
 *   len   size of the destination buffer in bytes, must be > 0
 *
 * The result is always NUL-terminated; a value longer than len - 1 bytes is
 * truncated.
 *
 * Returns 0 on success, GUMBO_ARGNULL for a NULL pointer argument or a
 * non-positive len, the error from WalkHtmlPath when the path is not found,
 * GUMBO_PARSEERROR when the node found is not an ELEMENT (only elements
 * carry attributes), and the error from GetAttrByID when the attribute is
 * missing.
 */
int HtmlGetAttrValueByPath(GumboNode *root, HtmlPathNode *path, char *id, char *text, int len)
{
    int ret;
    GumboNode *node = NULL;
    const char *attrvalue = NULL;

    if (root == NULL || path == NULL || len <= 0 || id == NULL || text == NULL)
        return GUMBO_ARGNULL;

    ret = WalkHtmlPath(root, path, &node);
    if (ret)
        return ret;

    if (node == NULL) {
        text[0] = '\0';
        return GUMBO_PARSEERROR;
    }
    if (GetNodeType(node) != GUMBO_NODE_ELEMENT)
        return GUMBO_PARSEERROR;

    ret = GetAttrByID(node, id, &attrvalue);
    if (ret)
        return ret;

    /* snprintf writes at most len bytes including the terminator, so the
       buffer can neither overflow nor be left unterminated. */
    snprintf(text, (size_t)len, "%s", attrvalue);
    return 0;
}
/*
 * Map a tag value to its name.
 *
 * Values inside [0, GUMBO_TAG_LAST) are passed to gumbo_normalized_tagname;
 * anything else yields the global `invalid` string.
 */
const char *TagEnumToName(GumboTag d)
{
    const char *name = invalid;

    if (d >= 0 && d < GUMBO_TAG_LAST)
        name = gumbo_normalized_tagname(d);

    return name;
}
/*
 * Map a node type value to a printable upper-case name.
 *
 * Values outside [0, GUMBO_NODE_WHITESPACE], and values inside that range
 * that are not one of the cases below, yield the global `invalid` string.
 */
const char *TypeEnumToName(GumboNodeType d)
{
    const char *name = invalid;

    if (d >= 0 && d <= GUMBO_NODE_WHITESPACE) {
        switch (d) {
        case GUMBO_NODE_DOCUMENT:   /* document node */
            name = "DOCUMENT";
            break;
        case GUMBO_NODE_ELEMENT:    /* element node */
            name = "ELEMENT";
            break;
        case GUMBO_NODE_TEXT:       /* text node */
            name = "TEXT";
            break;
        case GUMBO_NODE_CDATA:      /* CDATA node */
            name = "CDATA";
            break;
        case GUMBO_NODE_COMMENT:    /* comment node */
            name = "COMMENT";
            break;
        case GUMBO_NODE_WHITESPACE: /* text node consisting of whitespace */
            name = "WHITESPACE";
            break;
        default:
            break;
        }
    }

    return name;
}
/*
 * Recursively visit `node` and all its element descendants. For every
 * element with tag `tag` that has an attribute named `id` whose value
 * equals `id`, and whose first child is a TEXT node, call func with that
 * text.
 *
 *   node  subtree root; non-element nodes are ignored
 *   tag   tag an element must have to be considered
 *   id    attribute name to look up
 *   func  user callback receiving the text
 *
 * Nothing happens when node, id or func is NULL.
 *
 * NOTE(review): the attribute value is compared against the attribute name
 * (`id`) itself, because there is no separate value parameter -- confirm
 * that this is what callers expect.
 */
void GetAllAttrByTag(GumboNode *node, GumboTag tag, char* id, void (*func)(const char*))
{
    GumboVector *children;
    GumboAttribute *attr;
    unsigned int i;

    if (node == NULL || id == NULL || func == NULL)
        return;
    /* Only elements have attributes and children to descend into. */
    if (node->type != GUMBO_NODE_ELEMENT)
        return;

    children = &node->v.element.children;

    if (node->v.element.tag == tag) {
        attr = gumbo_get_attribute(&node->v.element.attributes, id);
        /* data[0] is only meaningful when the element has a child. */
        if (attr != NULL && strcmp(attr->value, id) == 0 && children->length > 0) {
            GumboNode *first = (GumboNode *)children->data[0];

            if (first != NULL && first->type == GUMBO_NODE_TEXT)
                func(first->v.text.text);
        }
    }

    /* Descend into every child; non-elements return immediately. */
    for (i = 0; i < children->length; i++)
        GetAllAttrByTag((GumboNode *)children->data[i], tag, id, func);
}
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化