实例介绍
【实例简介】
【实例截图】
【实例截图】
【核心代码】
namespace AngleSharp
{
    using AngleSharp.Extensions;
    using AngleSharp.Html;
    using System;
    using System.Collections.Generic;
    using System.Text;

    /// <summary>
    /// Various HTML encoding helpers: a charset-label lookup table, locale
    /// based encoding suggestions and a Punycode encoder for host names.
    /// </summary>
    static class TextEncoding
    {
        #region Fields

        // Charset label -> encoding; labels are compared case-insensitively.
        static readonly Dictionary<String, Encoding> encodings = new Dictionary<String, Encoding>(StringComparer.OrdinalIgnoreCase);

        // Two-letter language prefix -> suggested encoding (case-insensitive).
        static readonly Dictionary<String, Encoding> suggestions = new Dictionary<String, Encoding>(StringComparer.OrdinalIgnoreCase);

        #endregion

        #region Encodings

        // NOTE: Utf8 must stay declared before every field that is initialized
        // through GetEncoding(...), because GetEncoding falls back to Utf8 and
        // static field initializers run in textual order.

        /// <summary>
        /// Gets the UTF-8 encoding.
        /// </summary>
        public static readonly Encoding Utf8 = Encoding.UTF8;

        /// <summary>
        /// Gets the UTF-16 (Big Endian) encoding.
        /// </summary>
        public static readonly Encoding Utf16Be = Encoding.BigEndianUnicode;

        /// <summary>
        /// Gets the UTF-16 (Little Endian) encoding.
        /// </summary>
        public static readonly Encoding Utf16Le = Encoding.Unicode;

        /// <summary>
        /// Gets the UTF-32 (Little Endian) encoding.
        /// </summary>
        public static readonly Encoding Utf32Le = GetEncoding("UTF-32LE");

        /// <summary>
        /// Gets the UTF-32 (Big Endian) encoding.
        /// </summary>
        public static readonly Encoding Utf32Be = GetEncoding("UTF-32BE");

        /// <summary>
        /// Gets the chinese government standard encoding.
        /// </summary>
        public static readonly Encoding Gb18030 = GetEncoding("GB18030");

        /// <summary>
        /// Gets the Big5 encoding.
        /// </summary>
        public static readonly Encoding Big5 = GetEncoding("big5");

        /// <summary>
        /// Gets the Windows-1252 encoding.
        /// </summary>
        public static readonly Encoding Windows1252 = GetEncoding("windows-1252");

        #endregion

        #region Initialization

        /// <summary>
        /// Fills the charset label table and the locale suggestion table.
        /// Every lookup goes through <see cref="GetEncoding"/>, so a charset
        /// that the platform does not provide is mapped to UTF-8 instead of
        /// failing the type initializer.
        /// </summary>
        static TextEncoding()
        {
            Register(Utf8, "unicode-1-1-utf-8", "utf-8", "utf8");
            Register(Utf16Be, "utf-16be");
            Register(Utf16Le, "utf-16", "utf-16le");

            var windows874 = GetEncoding("windows-874");
            Register(windows874, "dos-874", "iso-8859-11", "iso8859-11", "iso885911", "tis-620", "windows-874");

            var windows1250 = GetEncoding("windows-1250");
            Register(windows1250, "cp1250", "windows-1250", "x-cp1250");

            var windows1251 = GetEncoding("windows-1251");
            Register(windows1251, "cp1251", "windows-1251", "x-cp1251");

            Register(Windows1252,
                "x-user-defined", "ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1",
                "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1",
                "iso_8859-1:1987", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252");

            var windows1253 = GetEncoding("windows-1253");
            Register(windows1253, "cp1253", "windows-1253", "x-cp1253");

            var windows1254 = GetEncoding("windows-1254");
            Register(windows1254,
                "cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", "iso8859-9", "iso88599",
                "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254");

            var windows1255 = GetEncoding("windows-1255");
            Register(windows1255, "cp1255", "windows-1255", "x-cp1255");

            var windows1256 = GetEncoding("windows-1256");
            Register(windows1256, "cp1256", "windows-1256", "x-cp1256");

            var windows1257 = GetEncoding("windows-1257");
            Register(windows1257, "cp1257", "windows-1257", "x-cp1257");

            var w1258 = GetEncoding("windows-1258");
            Register(w1258, "cp1258", "windows-1258", "x-cp1258");

            var macintosh = GetEncoding("macintosh");
            Register(macintosh, "csmacintosh", "mac", "macintosh", "x-mac-roman");

            var maccyrillic = GetEncoding("x-mac-cyrillic");
            Register(maccyrillic, "x-mac-cyrillic", "x-mac-ukrainian");

            var i866 = GetEncoding("cp866");
            Register(i866, "866", "cp866", "csibm866", "ibm866");

            var latin2 = GetEncoding("iso-8859-2");
            Register(latin2,
                "csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2", "iso88592",
                "iso_8859-2", "iso_8859-2:1987", "l2", "latin2");

            var latin3 = GetEncoding("iso-8859-3");
            Register(latin3,
                "csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3", "iso88593",
                "iso_8859-3", "iso_8859-3:1988", "l3", "latin3");

            var latin4 = GetEncoding("iso-8859-4");
            Register(latin4,
                "csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4", "iso88594",
                "iso_8859-4", "iso_8859-4:1988", "l4", "latin4");

            var latin5 = GetEncoding("iso-8859-5");
            Register(latin5,
                "csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144", "iso8859-5",
                "iso88595", "iso_8859-5", "iso_8859-5:1988");

            var latin6 = GetEncoding("iso-8859-6");
            Register(latin6,
                "arabic", "asmo-708", "csiso88596e", "csiso88596i", "csisolatinarabic",
                "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-ir-127",
                "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987");

            var latin7 = GetEncoding("iso-8859-7");
            Register(latin7,
                "csisolatingreek", "ecma-118", "elot_928", "greek", "greek8", "iso-8859-7",
                "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", "iso_8859-7:1987",
                "sun_eu_greek");

            var latin8 = GetEncoding("iso-8859-8");
            Register(latin8,
                "csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8", "iso-8859-8-e",
                "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", "iso_8859-8:1988",
                "visual");

            var latini = GetEncoding("iso-8859-8-i");
            Register(latini, "csiso88598i", "iso-8859-8-i", "logical");

            var latin13 = GetEncoding("iso-8859-13");
            Register(latin13, "iso-8859-13", "iso8859-13", "iso885913");

            var latin15 = GetEncoding("iso-8859-15");
            Register(latin15, "csisolatin9", "iso-8859-15", "iso8859-15", "iso885915", "iso_8859-15", "l9");

            var kr = GetEncoding("koi8-r");
            Register(kr, "cskoi8r", "koi", "koi8", "koi8-r", "koi8_r");
            Register(GetEncoding("koi8-u"), "koi8-u");

            var chinese = GetEncoding("x-cp20936");
            Register(chinese,
                "chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80",
                "gbk", "iso-ir-58", "x-gbk");
            Register(GetEncoding("hz-gb-2312"), "hz-gb-2312");
            Register(Gb18030, "gb18030");

            var big5 = GetEncoding("big5");
            Register(big5, "big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5");

            var isojp = GetEncoding("iso-2022-jp");
            Register(isojp, "csiso2022jp", "iso-2022-jp");

            var isokr = GetEncoding("iso-2022-kr");
            Register(isokr, "csiso2022kr", "iso-2022-kr");

            var isocn = GetEncoding("iso-2022-cn");
            Register(isocn, "iso-2022-cn", "iso-2022-cn-ext");

            Register(GetEncoding("shift_jis"), "shift_jis");

            // Goes through the safe wrapper like every other charset; calling
            // Encoding.GetEncoding directly here could throw inside the type
            // initializer when the platform lacks this code page.
            var eucjp = GetEncoding("euc-jp");
            Register(eucjp, "euc-jp");

            RegisterSuggestion(Utf8, "ar", "cy", "fa", "hr", "kk", "mk", "or", "ro", "sr", "vi");
            RegisterSuggestion(latin5, "be");
            RegisterSuggestion(windows1251, "bg", "ru", "uk");
            RegisterSuggestion(latin2, "cs", "hu", "pl", "sl");
            RegisterSuggestion(windows1254, "tr", "ku");
            RegisterSuggestion(windows1255, "he");
            RegisterSuggestion(GetEncoding("iso-8859-13"), "lv");
            // Windows-31J ???? Replaced by something better anyway
            RegisterSuggestion(Utf8, "ja");
            RegisterSuggestion(GetEncoding("ks_c_5601-1987"), "ko");
            RegisterSuggestion(windows1257, "lt");
            RegisterSuggestion(windows1250, "sk");
            RegisterSuggestion(windows874, "th");
        }

        /// <summary>
        /// Adds all given charset labels for one encoding to the label table.
        /// </summary>
        /// <param name="encoding">The encoding the labels resolve to.</param>
        /// <param name="names">The labels; each must be unique in the table.</param>
        static void Register(Encoding encoding, params String[] names)
        {
            foreach (var name in names)
                encodings.Add(name, encoding);
        }

        /// <summary>
        /// Adds all given language prefixes for one encoding to the suggestion table.
        /// </summary>
        /// <param name="encoding">The encoding to suggest.</param>
        /// <param name="locals">The language prefixes; each must be unique in the table.</param>
        static void RegisterSuggestion(Encoding encoding, params String[] locals)
        {
            foreach (var local in locals)
                suggestions.Add(local, encoding);
        }

        #endregion

        #region Extensions

        /// <summary>
        /// Checks if the provided encoding is any UTF-16 encoding.
        /// </summary>
        /// <param name="encoding">The encoding to check.</param>
        /// <returns>The result of the check (UTF-16BE, UTF-16LE).</returns>
        public static Boolean IsUnicode(this Encoding encoding)
        {
            return encoding == Utf16Be || encoding == Utf16Le;
        }

        #endregion

        #region Methods

        /// <summary>
        /// Tries to extract the encoding from the given http-equiv content string.
        /// </summary>
        /// <param name="content">The value of the attribute.</param>
        /// <returns>
        /// The extracted encoding or null if the content is null, carries no
        /// charset declaration or names a charset that is not in the table.
        /// </returns>
        public static Encoding Parse(String content)
        {
            if (content == null)
                return null;

            var encoding = String.Empty;
            var position = 0;
            var keyword = AttributeNames.Charset;
            content = content.ToLowerInvariant();

            // Locate the keyword; it only counts when at least one character
            // follows it. Ordinal, in-place comparison (no substring per step).
            for (var i = 0; i < content.Length - keyword.Length; i++)
            {
                if (String.CompareOrdinal(content, i, keyword, 0, keyword.Length) == 0)
                {
                    position = i + keyword.Length;
                    break;
                }
            }

            if (position > 0 && position < content.Length)
            {
                // Skip whitespace between the keyword and '='. The bound keeps
                // position on a valid index for the check that follows.
                for (var i = position; i < content.Length - 1; i++)
                {
                    if (content[i].IsSpaceCharacter())
                        position++;
                    else
                        break;
                }

                // Keyword not followed by '=': retry on the remainder, which
                // may contain another occurrence of the keyword.
                if (content[position] != Symbols.Equality)
                    return Parse(content.Substring(position));

                position++;

                // Skip whitespace between '=' and the value.
                for (var i = position; i < content.Length; i++)
                {
                    if (content[i].IsSpaceCharacter())
                        position++;
                    else
                        break;
                }

                if (position < content.Length)
                {
                    if (content[position] == Symbols.DoubleQuote)
                    {
                        // Quoted value: only accepted when the closing quote exists.
                        content = content.Substring(position + 1);
                        var index = content.IndexOf(Symbols.DoubleQuote);

                        if (index != -1)
                            encoding = content.Substring(0, index);
                    }
                    else if (content[position] == Symbols.SingleQuote)
                    {
                        content = content.Substring(position + 1);
                        var index = content.IndexOf(Symbols.SingleQuote);

                        if (index != -1)
                            encoding = content.Substring(0, index);
                    }
                    else
                    {
                        // Unquoted value: runs up to the first whitespace or ';'.
                        content = content.Substring(position);
                        var index = 0;

                        for (var i = 0; i < content.Length; i++)
                        {
                            if (content[i].IsSpaceCharacter())
                                break;
                            else if (content[i] == ';')
                                break;
                            else
                                index++;
                        }

                        encoding = content.Substring(0, index);
                    }
                }
            }

            // Single lookup; unknown (or empty) labels yield null.
            Encoding result;
            return encodings.TryGetValue(encoding, out result) ? result : null;
        }

        /// <summary>
        /// Detects if a valid encoding has been found in the given charset string.
        /// </summary>
        /// <param name="charset">The parsed charset string.</param>
        /// <returns>True if a valid encoding has been found, otherwise false (also for null).</returns>
        public static Boolean IsSupported(String charset)
        {
            return charset != null && encodings.ContainsKey(charset);
        }

        /// <summary>
        /// Resolves an Encoding instance given by the charset string.
        /// If the desired encoding is not found (or supported), then
        /// UTF-8 will be returned.
        /// </summary>
        /// <param name="charset">The charset string.</param>
        /// <returns>The registered encoding, or UTF-8 as fallback; never null.</returns>
        public static Encoding Resolve(String charset)
        {
            Encoding encoding;

            if (charset != null && encodings.TryGetValue(charset, out encoding))
                return encoding;

            return Utf8;
        }

        /// <summary>
        /// Suggests an Encoding for the given locale.
        /// </summary>
        /// <param name="local">The locale defined by the BCP 47 language tag.</param>
        /// <returns>
        /// The encoding registered for the first two characters of the tag,
        /// GB18030 for "zh-cn", Big5 for "zh-tw", otherwise Windows-1252.
        /// </returns>
        public static Encoding Suggest(String local)
        {
            if (!String.IsNullOrEmpty(local) && local.Length > 1)
            {
                Encoding encoding;

                if (suggestions.TryGetValue(local.Substring(0, 2), out encoding))
                    return encoding;
                else if (local.Equals("zh-cn", StringComparison.OrdinalIgnoreCase))
                    return Gb18030;
                else if (local.Equals("zh-tw", StringComparison.OrdinalIgnoreCase))
                    return Big5;
            }

            return Windows1252;
        }

        /// <summary>
        /// Gets the encoding for lesser used charsets. This might result in an
        /// exception depending on the platform (mostly Windows Phone *).
        /// Exceptions are handled by returning UTF8. That should work well in
        /// most scenarios.
        /// </summary>
        /// <param name="name">The name of the charset.</param>
        /// <returns>The encoding for the given charset.</returns>
        static Encoding GetEncoding(String name)
        {
            try
            {
                return Encoding.GetEncoding(name);
            }
            catch
            {
                // We use a catch em all since WP8 does throw a different exception than W*.
                return Utf8;
            }
        }

        #endregion

        #region Punycode

        const Int32 PunycodeBase = 36;
        const Int32 tmin = 1;
        const Int32 tmax = 26;

        static readonly String AcePrefix = "xn--";
        static readonly Char[] PossibleDots = { '.', '\u3002', '\uFF0E', '\uFF61' };

        /// <summary>
        /// Checks if the code point lies outside the BMP, i.e. occupies two
        /// UTF-16 code units.
        /// </summary>
        static Boolean IsSupplementary(Int32 test)
        {
            return test >= 0x10000;
        }

        /// <summary>
        /// Checks if the character is one of the accepted label separators.
        /// </summary>
        static Boolean IsDot(Char c)
        {
            return Array.IndexOf(PossibleDots, c) >= 0;
        }

        /// <summary>
        /// Maps a base-36 digit value (0-35) to its Punycode character.
        /// </summary>
        static Char EncodeDigit(Int32 d)
        {
            // 26-35 map to ASCII 0-9
            if (d > 25)
                return (Char)(d - 26 + '0');

            // 0-25 map to a-z or A-Z
            return (Char)(d + 'a');
        }

        /// <summary>
        /// Lower-cases a basic code point by shifting upper-case characters
        /// by the distance between 'A' and 'a'.
        /// </summary>
        static Char EncodeBasic(Char bcp)
        {
            if (Char.IsUpper(bcp))
                bcp += (Char)('a' - 'A');

            return bcp;
        }

        /// <summary>
        /// Bias adaptation function of the Punycode algorithm.
        /// </summary>
        /// <param name="delta">The delta that has just been encoded.</param>
        /// <param name="numpoints">The number of code points handled so far (including this one).</param>
        /// <param name="firsttime">True for the first non-basic code point of a label.</param>
        /// <returns>The new bias.</returns>
        static Int32 AdaptChar(Int32 delta, Int32 numpoints, Boolean firsttime)
        {
            const Int32 Skew = 38;
            const Int32 Damp = 700;
            var k = 0u;

            delta = firsttime ? delta / Damp : delta / 2;
            delta += delta / numpoints;

            for (k = 0; delta > ((PunycodeBase - tmin) * tmax) / 2; k += PunycodeBase)
                delta /= PunycodeBase - tmin;

            return (Int32)(k + (PunycodeBase - tmin + 1) * delta / (delta + Skew));
        }

        /// <summary>
        /// Converts a host name to its ASCII compatible (Punycode) form, label
        /// by label. Labels consisting only of characters below 0x80 are
        /// lower-cased and copied; other labels are emitted with the "xn--"
        /// prefix. Separators are normalized to '.'.
        /// </summary>
        /// <param name="unicode">The host name to encode.</param>
        /// <returns>
        /// The encoded name, truncated to the overall name limit; the input
        /// itself if it is null or empty.
        /// </returns>
        /// <exception cref="ArgumentException">An encoded label exceeds 63 characters.</exception>
        public static String PunycodeEncode(String unicode)
        {
            const Int32 InitialBias = 72;
            const Int32 InitialNumber = 0x80;
            const Int32 MaxIntValue = 0x7ffffff;
            const Int32 LabelLimit = 63;
            const Int32 DefaultNameLimit = 255;

            // 0 length strings aren't allowed
            if (String.IsNullOrEmpty(unicode))
                return unicode;

            var output = new StringBuilder(unicode.Length);
            var iNextDot = 0;
            var iAfterLastDot = 0;
            var iOutputAfterLastDot = 0;

            // Find the next dot
            while (iNextDot < unicode.Length)
            {
                // Find end of this segment
                iNextDot = unicode.IndexOfAny(PossibleDots, iAfterLastDot);

                if (iNextDot < 0)
                    iNextDot = unicode.Length;

                // Only allowed to have empty . section at end (www.microsoft.com.)
                if (iNextDot == iAfterLastDot)
                    break;

                // We'll need an Ace prefix
                output.Append(AcePrefix);

                var basicCount = 0;
                var numProcessed = 0;

                // Copy the basic code points first; step over the low half of
                // surrogate pairs so it is not inspected on its own.
                for (basicCount = iAfterLastDot; basicCount < iNextDot; basicCount++)
                {
                    if (unicode[basicCount] < 0x80)
                    {
                        output.Append(EncodeBasic(unicode[basicCount]));
                        numProcessed++;
                    }
                    else if (Char.IsSurrogatePair(unicode, basicCount))
                    {
                        basicCount++;
                    }
                }

                var numBasicCodePoints = numProcessed;

                if (numBasicCodePoints == iNextDot - iAfterLastDot)
                {
                    // Pure basic label: the prefix was not needed after all.
                    output.Remove(iOutputAfterLastDot, AcePrefix.Length);
                }
                else
                {
                    // If it has some non-basic code points the input cannot start with xn--
                    if (unicode.Length - iAfterLastDot >= AcePrefix.Length &&
                        unicode.Substring(iAfterLastDot, AcePrefix.Length).Equals(AcePrefix, StringComparison.OrdinalIgnoreCase))
                        break;

                    // Need to do ACE encoding
                    var numSurrogatePairs = 0;

                    // Add a delimiter (-) if we had any basic code points (between basic and encoded pieces)
                    if (numBasicCodePoints > 0)
                        output.Append('-');

                    // Initialize the state
                    var n = InitialNumber;
                    var delta = 0;
                    var bias = InitialBias;

                    // Main loop
                    while (numProcessed < (iNextDot - iAfterLastDot))
                    {
                        var j = 0;
                        var m = 0;
                        var test = 0;

                        // Find the smallest code point >= n that is still unhandled.
                        for (m = MaxIntValue, j = iAfterLastDot; j < iNextDot; j += IsSupplementary(test) ? 2 : 1)
                        {
                            test = unicode.ConvertToUtf32(j);

                            if (test >= n && test < m)
                                m = test;
                        }

                        /* Increase delta enough to advance the decoder's */
                        /* <n,i> state to <m,0>: */
                        delta += (m - n) * ((numProcessed - numSurrogatePairs) + 1);
                        n = m;

                        for (j = iAfterLastDot; j < iNextDot; j += IsSupplementary(test) ? 2 : 1)
                        {
                            // Make sure we're aware of surrogates
                            test = unicode.ConvertToUtf32(j);

                            // Adjust for character position (only the chars in our string already, some
                            // haven't been processed.
                            if (test < n)
                                delta++;

                            if (test == n)
                            {
                                // Represent delta as a generalized variable-length integer:
                                int q, k;

                                for (q = delta, k = PunycodeBase; ; k += PunycodeBase)
                                {
                                    int t = k <= bias ? tmin : k >= bias + tmax ? tmax : k - bias;

                                    if (q < t)
                                        break;

                                    output.Append(EncodeDigit(t + (q - t) % (PunycodeBase - t)));
                                    q = (q - t) / (PunycodeBase - t);
                                }

                                output.Append(EncodeDigit(q));
                                bias = AdaptChar(delta, (numProcessed - numSurrogatePairs) + 1, numProcessed == numBasicCodePoints);
                                delta = 0;
                                numProcessed++;

                                // A supplementary code point consumed two UTF-16 units.
                                if (IsSupplementary(m))
                                {
                                    numProcessed++;
                                    numSurrogatePairs++;
                                }
                            }
                        }

                        ++delta;
                        ++n;
                    }
                }

                // Make sure its not too big
                if (output.Length - iOutputAfterLastDot > LabelLimit)
                    throw new ArgumentException();

                // Done with this segment, add dot if necessary
                if (iNextDot != unicode.Length)
                    output.Append(PossibleDots[0]);

                iAfterLastDot = iNextDot + 1;
                iOutputAfterLastDot = output.Length;
            }

            // A name without trailing dot gets one character less.
            var rest = IsDot(unicode[unicode.Length - 1]) ? 0 : 1;
            var maxlength = DefaultNameLimit - rest;

            // Truncate (rather than throw) if we're too long
            if (output.Length > maxlength)
                output.Remove(maxlength, output.Length - maxlength);

            return output.ToString();
        }

        #endregion
    }
}
好例子网口号:伸出你的我的手 — 分享!
网友评论
小贴士
感谢您为本站写下的评论,您的评论对其它用户来说具有重要的参考价值,所以请认真填写。
- 类似“顶”、“沙发”之类没有营养的文字,对勤劳贡献的楼主来说是令人沮丧的反馈信息。
- 相信您也不想看到一排文字/表情墙,所以请不要反馈意义不大的重复字符,也请尽量不要纯表情的回复。
- 提问之前请再仔细看一遍楼主的说明,或许是您遗漏了。
- 请勿到处挖坑绊人、招贴广告。既占空间让人厌烦,又没人会搭理,于人于己都无利。
关于好例子网
本站旨在为广大IT学习爱好者提供一个非营利性互相学习交流分享平台。本站所有资源都可以被免费获取学习研究。本站资源来自网友分享,对搜索内容的合法性不具有预见性、识别性、控制性,仅供学习研究,请务必在下载后24小时内予以删除,不得用于其他任何用途,否则后果自负。基于互联网的特殊性,平台无法对用户传输的作品、信息、内容的权属或合法性、安全性、合规性、真实性、科学性、完整性、有效性等进行实质审查;无论平台是否已进行审查,用户均应自行承担因其传输的作品、信息、内容而可能或已经产生的侵权或权属纠纷等法律责任。本站所有资源不代表本站的观点或立场,基于网友分享,根据中国法律《信息网络传播权保护条例》第二十二条与第二十三条之规定,若资源存在侵权或相关问题请联系本站客服人员,点此联系我们。关于更多版权及免责声明参见 版权及免责声明
支持(0) 盖楼(回复)