• 欢迎访问搞代码网站,推荐使用最新版火狐浏览器和Chrome浏览器访问本网站!
  • 如果您觉得本站非常有看点,那么赶紧使用Ctrl+D 收藏搞代码吧

C# 将 HTML 转成纯文本

c# 搞代码 4年前 (2022-01-09) 8次浏览 已收录 0个评论
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// Block-level tags are translated to line breaks (table cells to tabs),
/// the content of script/noscript/style/object elements is dropped, runs of
/// whitespace and blank lines are collapsed (except inside &lt;pre&gt;), and
/// HTML entities are decoded at the very end.
/// An instance keeps its parser state in fields, so a single instance must
/// not be used for concurrent conversions.
/// NOTE(review): the published listing lost every span between a '&lt;'
/// literal and the next '&gt;' (plus all generic type arguments). The start of
/// the tag branch in Convert, the tag-name scan in ParseTag, the body of
/// EatInnerContent and the EndOfText header were reconstructed from the
/// surviving comments and call sites - confirm against the original article.
/// </remarks>
class HtmlToText
{
    // Static data tables
    protected static Dictionary<string, string> _tags;
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        // Text emitted when the given (lower-case) tag name is encountered.
        // Names starting with '/' are closing tags.
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        // Elements whose entire inner content is discarded
        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted. A null value is treated
    /// as an empty document.</param>
    /// <returns>Resulting plain text</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                switch (tag)
                {
                    case "body":
                        // Discard content before <body>
                        _text.Clear();
                        break;
                    case "/body":
                        // Discard content after </body>
                        _pos = _html.Length;
                        break;
                    case "pre":
                        // Enter preformatted mode
                        _text.Preformatted = true;
                        EatWhitespaceToNextLine();
                        break;
                    case "/pre":
                        // Exit preformatted mode
                        _text.Preformatted = false;
                        break;
                }

                // Emit the line break / tab associated with this tag, if any
                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                // Skip everything inside ignored elements
                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded last so that a decoded '<' can never be
        // mistaken for the start of a tag
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    // Eats all characters that are part of the current tag
    // and returns information about that tag: the lower-case tag name
    // (with a leading '/' for closing tags) and whether a '/' was seen
    // after the name (self-closing). Returns an empty string if the
    // current character is not '<'.
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }
            // Invariant casing keeps the table lookups independent of the
            // current culture (e.g. the Turkish dotless i)
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                {
                    // Quoted attribute values may legally contain '>'
                    EatQuotedValue();
                }
                else
                {
                    if (Peek() == '/')
                    {
                        selfClosing = true;
                    }
                    MoveAhead();
                }
            }
            MoveAhead();    // Closing '>'
        }
        return tag;
    }

    // Consumes inner content from the current tag, up to and including
    // the matching end tag (or to the end of the input if there is none).
    // Nested elements of the same name are balanced.
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;
        int depth = 1;

        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string inner = ParseTag(out selfClosing);
                if (inner == endTag)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return;
                    }
                }
                else if (inner == tag && !selfClosing)
                {
                    depth++;
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    // Returns true if the current position is at the end of
    // the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ('\0' when positioned at the end of the input)
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace
    // character.
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    // Moves the current position to the next non-whitespace
    // character or the start of the next line, whichever
    // comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    // Moves the current position past a quoted value. An unterminated
    // value ends at the first line break so that one stray quote cannot
    // swallow the rest of the document.
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();
            // Find end of value
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead();    // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else if (c == '\r')
            {
                // Ignore carriage returns. We'll process
                // '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Write single space character
                int len = _currLine.Length;
                if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line; after trimming, a line made only of
            // whitespace is the empty string
            string line = _currLine.ToString().Trim();

            if (line.Length == 0)
            {
                // An empty line: keep at most one in a row and never
                // start the output with one
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}

搞代码网(gaodaima.com)提供的所有资源部分来自互联网,如果有侵犯您的版权或其他权益,请说明详细缘由并提供版权或权益证明然后发送到邮箱[email protected],我们会在看到邮件的第一时间内为您处理,或直接联系QQ:872152909。本网站采用BY-NC-SA协议进行授权
转载请注明原文链接:C# 将 HTML 转成纯文本
喜欢 (0)
[搞代码]
分享 (0)
发表我的评论
取消评论

表情 贴图 加粗 删除线 居中 斜体 签到

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址