Following up on the previous article 《C#实现将TCP包拼接为完整的HTTP报文》 (reassembling TCP packets into complete HTTP messages in C#), the next step is to parse the HTTP message so that its data can be analyzed.

To parse an HTTP message, we need to:

  • Read the various fields from the HTTP header
  • Analyze those field values to obtain the content encoding and the character set (a small stand-alone sketch of this follows the list)
  • Separate the header from the body
  • Determine whether the body is text or binary; binary bodies are not processed further
  • If the body is text, decompress and decode it according to the content encoding and character set declared in the header
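
As an aside, here is a minimal stand-alone sketch of steps 2 and 4: pulling the charset out of a Content-Type value and a crude text-vs-binary check. The ContentTypeHelper class and its method names are my own invention, not something provided by the framework, and the list of "textual" MIME types is deliberately incomplete:

using System;
using System.Text;

static class ContentTypeHelper
{
    // Treat common textual MIME types as text; everything else is considered binary.
    public static bool IsTextContent(string contentType)
    {
        if (string.IsNullOrEmpty(contentType)) return false;
        var mime = contentType.Split(';')[0].Trim().ToLowerInvariant();
        return mime.StartsWith("text/")
            || mime == "application/json"
            || mime == "application/xml"
            || mime == "application/javascript";
    }

    // Extract "charset=..." from a value like "text/html; charset=gb2312";
    // fall back to UTF-8 when no charset is declared.
    public static Encoding GetCharset(string contentType)
    {
        if (!string.IsNullOrEmpty(contentType))
        {
            foreach (var part in contentType.Split(';'))
            {
                var p = part.Trim();
                if (p.StartsWith("charset=", StringComparison.OrdinalIgnoreCase))
                    return Encoding.GetEncoding(p.Substring("charset=".Length).Trim('"'));
            }
        }
        return Encoding.UTF8;
    }
}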

I haven't found a built-in parser for this in the .NET Framework. In theory HttpClient and related classes must already implement this parsing internally, but for whatever reason those routines are not exposed. (Or perhaps I simply didn't find them.)

So we have no choice but to parse the data ourselves.

Let's first look at an HTTP message whose text body has been gzip-compressed:

(screenshot: a captured HTTP response whose body is gzip-compressed)

Here is a rudimentary parsing class written by another developer (already modified here - the original code contained some serious bugs):

using System;
using System.Linq;

public enum HTTPHeaderField
    {
        Accept = 0,
        Accept_Charset = 1,
        Accept_Encoding = 2,
        Accept_Language = 3,
        Accept_Ranges = 4,
        Authorization = 5,
        Cache_Control = 6,
        Connection = 7,
        Cookie = 8,
        Content_Length = 9,
        Content_Type = 10,
        Date = 11,
        Expect = 12,
        From = 13,
        Host = 14,
        If_Match = 15,
        If_Modified_Since = 16,
        If_None_Match = 17,
        If_Range = 18,
        If_Unmodified_Since = 19,
        Max_Forwards = 20,
        Pragma = 21,
        Proxy_Authorization = 22,
        Range = 23,
        Referer = 24,
        TE = 25,
        Upgrade = 26,
        User_Agent = 27,
        Via = 28,
        Warn = 29,
        Age = 30,
        Allow = 31,
        Content_Encoding = 32,
        Content_Language = 33,
        Content_Location = 34,
        Content_Disposition = 35,
        Content_MD5 = 36,
        Content_Range = 37,
        ETag = 38,
        Expires = 39,
        Last_Modified = 40,
        Location = 41,
        Proxy_Authenticate = 42,
        Refresh = 43,
        Retry_After = 44,
        Server = 45,
        Set_Cookie = 46,
        Trailer = 47,
        Transfer_Encoding = 48,
        Vary = 49,
        Warning = 50,
        WWW_Authenticate = 51
    };


    class HTTPHeader
    {
        #region PROPERTIES
        private string[] m_StrHTTPField = new string[52];
        private byte[] m_byteData = new byte[0]; // empty until a body is actually found

        public string[] HTTPField
        {
            get { return m_StrHTTPField; }
            set { m_StrHTTPField = value; }
        }
        public byte[] Data
        {
            get { return m_byteData; }
            set { m_byteData = value; }
        }
        #endregion
        // conversion helper used to turn the raw bytes into a string
        System.Text.ASCIIEncoding encoding = new System.Text.ASCIIEncoding();


        #region CONSTRUCTOR
        /// <summary>
        /// Default constructor - not used
        /// </summary>
        private HTTPHeader()
        { }

        public HTTPHeader(byte[] ByteHTTPRequest)
        {
            string HTTPRequest = encoding.GetString(ByteHTTPRequest);
            try
            {
                int IndexHeaderEnd;
                string Header;

                // If the request is no longer than 1460 bytes, the whole string is the HTTP header
                if (HTTPRequest.Length <= 1460)
                    Header = HTTPRequest;
                else
                {
                    // The header ends at the first blank line (\r\n\r\n); everything after it is the body
                    IndexHeaderEnd = HTTPRequest.IndexOf("\r\n\r\n");
                    Header = HTTPRequest.Substring(0, IndexHeaderEnd);
                    Data = ByteHTTPRequest.Skip(IndexHeaderEnd + 4).ToArray();
                }

                HTTPHeaderParse(Header);
            }
            catch (Exception)
            { } // swallow parse errors; the fields simply stay empty
        }
        #endregion

        #region METHODS
        private void HTTPHeaderParse(string Header)
        {
            #region HTTP HEADER REQUEST & RESPONSE

            HTTPHeaderField HHField;
            string HTTPfield, buffer;
            int Index;
            foreach (int IndexHTTPfield in Enum.GetValues(typeof(HTTPHeaderField)))
            {
                HHField = (HTTPHeaderField)IndexHTTPfield;
                HTTPfield = "n" + HHField.ToString().Replace('_', '-') + ": "; //Ajout de n devant pour éviter les doublons entre cookie et set_cookie
                // Si le champ n'est pas présent dans la requête, on passe au champ suivant
                Index = Header.IndexOf(HTTPfield);
                if (Index == -1)
                    continue;

                buffer = Header.Substring(Index + HTTPfield.Length);
                Index = buffer.IndexOf("rn");
                if (Index == -1)
                    m_StrHTTPField[IndexHTTPfield] = buffer.Trim();
                else
                    m_StrHTTPField[IndexHTTPfield] = buffer.Substring(0, Index).Trim();

                //Console.WriteLine("Index = " + IndexHTTPfield + " | champ = " + HTTPfield.Substring(1) + " " + m_StrHTTPField[IndexHTTPfield]);
            }

            // Dump all fields
            /*for (int j = 0; j < m_StrHTTPField.Length; j++)
            {
                HHField = (HTTPHeaderField)j;
                Console.WriteLine("m_StrHTTPField[" + j + "]; " + HHField + " = " + m_StrHTTPField[j]);
            }
            */
            #endregion

        }
        #endregion
    }

The following code uses the class above to parse a data file:

    using System;
    using System.IO;
    using System.Text;

    class Program
    {
        static void Main(string[] args)
        {
            START: Console.WriteLine("Enter the full path of the HTTP message data file to parse:");
            var filename = Console.ReadLine();
            try
            {
                FileStream fs = new FileStream(filename, FileMode.Open);
                BinaryReader br = new BinaryReader(fs);
                var data = br.ReadBytes((int)fs.Length);
                var header = new HTTPHeader(data);
                var x = 0;
                foreach (var f in header.HTTPField)
                {
                    if (!String.IsNullOrEmpty(f))
                    {
                        Console.WriteLine($"[{x:00}] - {(HTTPHeaderField) x} : {f}");
                    }
                    x++;
                }
                Console.WriteLine($"总数据尺寸{fs.Length}字节,实际数据尺寸{header.Data.Length}字节");
                Console.WriteLine(Encoding.UTF8.GetString(header.Data));
                Console.WriteLine();
                br.Close();
                fs.Close();
            }
            catch (Exception e)
            {
                Console.WriteLine(e);
            }
            goto START;
        }
    }

Gzip decompression and charset decoding are not implemented here yet; the body is simply decoded as UTF-8 for display. (I'll write that part when it's actually needed - it's all grunt work~)
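
When that step does become necessary, something along the following lines should work, using the header fields the class above has already extracted. This is only a sketch under my own assumptions: BodyDecoder and DecodeBody are made-up names, only gzip and deflate are handled, and an unrecognized charset will make Encoding.GetEncoding throw:

using System;
using System.IO;
using System.IO.Compression;
using System.Text;

static class BodyDecoder
{
    public static string DecodeBody(HTTPHeader header)
    {
        var data = header.Data;
        var contentEncoding = header.HTTPField[(int)HTTPHeaderField.Content_Encoding];
        var contentType = header.HTTPField[(int)HTTPHeaderField.Content_Type];

        // Decompress gzip / deflate bodies; anything else is passed through untouched.
        if (!string.IsNullOrEmpty(contentEncoding))
        {
            if (contentEncoding.Contains("gzip"))
                data = Decompress(new GZipStream(new MemoryStream(data), CompressionMode.Decompress));
            else if (contentEncoding.Contains("deflate"))
                data = Decompress(new DeflateStream(new MemoryStream(data), CompressionMode.Decompress));
        }

        // Pick the charset declared in Content-Type, defaulting to UTF-8.
        var charset = Encoding.UTF8;
        if (!string.IsNullOrEmpty(contentType))
        {
            var idx = contentType.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
            if (idx >= 0)
                charset = Encoding.GetEncoding(contentType.Substring(idx + 8).Trim(' ', '"', ';'));
        }

        return charset.GetString(data);
    }

    private static byte[] Decompress(Stream compressed)
    {
        using (compressed)
        using (var output = new MemoryStream())
        {
            compressed.CopyTo(output);
            return output.ToArray();
        }
    }
}

With this in place, the Console.WriteLine(Encoding.UTF8.GetString(header.Data)) line in Main could simply become Console.WriteLine(BodyDecoder.DecodeBody(header)).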

Test results:

(screenshot: parsed output of the gzip-compressed message)

(screenshot: parsed output of an uncompressed message)

The second screenshot shows data that was not gzip-compressed.

When reposting this article, please state that it was reposted from the "SkyD (斯克迪亚) Developer Blog" and keep the URL link to this article.
