技术开发 频道

基本的HTML文本解析器的设计和实现

  下面是负责解析“开始标签”属性表文本(形如“key1=value1 key2=value2 key3”)的代码 parseNodeProps()。核心思路是按空格和等号字符把文本分割成属性名和属性值;由于想兼容 HTML 4.01 及更早版本中不标准的属性表写法(例如只有属性名、没有 = 号也没有属性值),实现起来颇费周折:

view plaincopy to clipboardprint?
//[virtual]  
void HtmlParser::parseNodeProps(HtmlNode* pNode)  
{  
    
if(pNode == NULL || pNode->propCount > 0 || pNode->text == NULL)  
        
return;  
    WCHAR
* p = pNode->text;  
    WCHAR
*ps = NULL;  
    CMem mem;  
    
bool inQuote1 = false, inQuote2 = false;  
    WCHAR c;  
    
while(c = *p)  
    
{  
        
if(c == L'\"')  
        
{  
            inQuote1
= !inQuote1;  
        }
  
        
else if(c == L'\'')  
        
{  
            inQuote2
= !inQuote2;  
        }
  
        
if((!inQuote1 && !inQuote2) && (c == L' ' || c == L'\t' || c == L'='))  
        
{  
            
if(ps)  
            
{  
                mem.AddPointer(duplicateStrAndUnquote(ps, p
- ps));  
                ps
= NULL;  
            }
  
            
if(c == L'=')  
                mem.AddPointer(NULL);  
        }
  
        
else  
        
{  
            
if(ps == NULL)  
                ps
= p;  
        }
  
        p
++;  
    }
  
    
if(ps)  
        mem.AddPointer(duplicateStrAndUnquote(ps, p
- ps));  
    mem.AddPointer(NULL);  
    mem.AddPointer(NULL);  
    WCHAR
** pp = (WCHAR**) mem.GetPtr();  
    CMem props;  
    
for(int i = 0, n = mem.GetSize() / sizeof(WCHAR*) - 2; i < n; i++)  
    
{  
        props.AddPointer(pp[i]);
//prop name  
        if(pp[i+1] == NULL)  
        
{  
            props.AddPointer(pp[i
+2]); //prop value  
            i += 2;  
        }
  
        
else  
            props.AddPointer(NULL);
//prop vlalue  
    }
  
    pNode
->propCount = props.GetSize() / sizeof(WCHAR*) / 2;  
    pNode
->props = (HtmlNodeProp*) props.Detach();  
}

  根据标签名称获取标签类型的 getHtmlTagTypeFromName() 方法就非常直白了——查表,逐项做不区分大小写的比较:

view plaincopy to clipboardprint?
//[virtual]  
HtmlTagType HtmlParser::getHtmlTagTypeFromName(const WCHAR* szTagName)  
{  
    
//todo: uses hashmap  
    struct N2T { const WCHAR* name; HtmlTagType type; };  
    
static N2T n2tTable[] =    
    
{  
        
{ L"A", TAG_A },  
        
{ L"FONT", TAG_FONT },  
        
{ L"IMG", TAG_IMG },  
        
{ L"P", TAG_P },  
        
{ L"DIV", TAG_DIV },  
        
{ L"SPAN", TAG_SPAN },  
        
{ L"BR", TAG_BR },  
        
{ L"B", TAG_B },  
        
{ L"I", TAG_I },  
        
{ L"HR", TAG_HR },  
    }
;  
    
for(int i = 0, count = sizeof(n2tTable)/sizeof(n2tTable[0]); i < count; i++)  
    
{  
        N2T
* p = &n2tTable[i];  
        
if(wcsicmp(p->name, szTagName) == 0)  
            
return p->type;  
    }
  
    
return TAG_UNKNOWN;  
}
0
相关文章