当前位置: 首页 > 图文教程 > 网络编程 > ASP.NET > ASP.NET实现数据采集

ASP.NET
Asp.net 时间操作基类(支持短日期,长日期,时间差)
asp.net 获取机器硬件信息(cpu频率、磁盘可用空间、内存容量等)
asp.net 数据库备份还原(sqlserver+access)
Asp.Net 数据操作类(附通用数据基类)
Asp.net 弹出对话框基类(输出alet警告框)
Asp.net 文件上传类(取得文件后缀名,保存文件,加入文字水印)
Asp.net Socket客户端(远程发送和接收数据)
Asp.net 字符串操作基类(安全,替换,分解等)
Asp.Net数据输出到EXCEL表格中
asp.net Gridview里添加汇总行
asp.net UpdatePanel的简单用法
asp.net ajaxControlToolkit FilteredTextBoxExtender的简单用法
this connector is disabled错误的解决方法
sql事务应用积累
asp.net Page.Controls对象(找到所有服务器控件)
在asp.NET中字符串替换的五种方法
ASP.NET缓存方法分析和实践示例代码
asp.net 在DNN模块开发中遇到的resx怪问题
ASP.NET State service状态服务的问题解决方法
asp.net 结合mysql存储过程进行分页代码

ASP.NET实现数据采集


出处:互联网   整理: 软晨网(RuanChen.com)   发布: 2009-08-14   浏览: 159 ::
收藏到网摘: n/a

这是我自己写的一个数据采集程序,是采集一个人才网里面人才的信息,第一次写Blog,写的不好不要见笑。

//先按照下面的字段创建一个数据表

以下为引用的内容:
public partial class Form2 : Form
    {
        public Form2()
        {
            InitializeComponent();
        }
        //姓名
        public static string XM = "";
        //年龄
        public static string nl = "";
        //性别
        public static string XB = "";
        //身高
        public static string SG = "";
        //政治面貌
        public static string mm = "";
        //民族
        public static string MZ = "";
        //学历
        public static string XL = "";
        //婚姻状况
        public static string HK = "";
        //所学专业
        public static string ZY = "";
        //工作经验
        public static string GZJY = "";
        //在职单位
        public static string ZZDW = "";
        //在职职位
        public static string ZZZW = "";
        //工作经历
        public static string GZJL = "";
        //要求月薪
        public static string YX = "";
        //工作性质
        public static string GZXZ = "";
        //求职意向
        public static string QZYX = "";
        //具体职务
        public static string JTZW = "";
        //期望工作地
        public static string QWGZD = "";
        //教育情况,语言水平,技术专长
        public static string QT = "";

        private void button1_Click(object sender, EventArgs e)
        {
            label1.Text = "正在采集数据……";

     //遍历数据的页数 
            for (int i = 1; i <=50; i++)
            {
                CJ("http://www.xcjob.cn/renli.asp?pageno=" + i);
            }

            label1.Text = "恭喜你采集完成!";
            MessageBox.Show("恭喜你采集完成!");
        }

        //采集数据
        private void CJ(string Url)
        {
     //获得页面源文件(Html)
            string strWebContent = YM(Url);

            //按照Html里面的标签  取出和数据有关的那段源码
            int iBodyStart = strWebContent.IndexOf("<body", 0);
            int aaa = strWebContent.IndexOf("关键字:", iBodyStart);
            int iTableStart = strWebContent.IndexOf("<table", aaa);
            int iTableEnd = strWebContent.IndexOf("</table>", iTableStart);
            string strWeb = strWebContent.Substring(iTableStart, iTableEnd - iTableStart);

            //生成HtmlDocument
            HtmlElementCollection htmlTR = HtmlTR_Content(strWeb, "tr");

            foreach (HtmlElement tr in htmlTR)
            {
                try
                {
      //姓名
                    XM = tr.GetElementsByTagName("a")[0].InnerText;
      //获得详细信息页面的网址
                    string a = tr.GetElementsByTagName("a")[0].GetAttribute("href").ToString();
                    a = "http://www.xcjob.cn" + a.Substring(11);

                    Content(a);
                }
                catch { }
            }
        }

        //采集详细数据
        private void Content(string URL)
        {
            try
            {
                string strWebContent = YM(URL);

                //按照Html里面的标签 取出和数据有关的那段源码
                int iBodyStart = strWebContent.IndexOf("<body", 0);
                int iTableStart = strWebContent.IndexOf("浏览次数", iBodyStart);
                int iTableEnd = strWebContent.IndexOf("<table", iTableStart);
                int dd = strWebContent.IndexOf("</table>", iTableEnd);
                string strWeb = strWebContent.Substring(iTableEnd, dd - iTableEnd + 8);

                HtmlElementCollection htmlTR = HtmlTR_Content(strWeb, "table");

                foreach (HtmlElement tr in htmlTR)
                {
                    try
                    {
                        //年龄
                        nl = tr.GetElementsByTagName("tr")[1].GetElementsByTagName("td")[1].InnerText;
                        //性别
                        string XB_SG = tr.GetElementsByTagName("tr")[1].GetElementsByTagName("td")[3].InnerText;
                        XB = XB_SG.Substring(0, 1);
                        //身高
                        SG = XB_SG.Substring(11);
                        //政治面貌
                        mm = tr.GetElementsByTagName("tr")[2].GetElementsByTagName("td")[1].InnerText;
                        //民族
                        MZ = tr.GetElementsByTagName("tr")[2].GetElementsByTagName("td")[3].InnerText;
                        //学历
                        XL = tr.GetElementsByTagName("tr")[3].GetElementsByTagName("td")[1].InnerText;
                        //婚烟状况
                        HK = tr.GetElementsByTagName("tr")[3].GetElementsByTagName("td")[3].InnerText;
                        //所学专业
                        ZY = tr.GetElementsByTagName("tr")[5].GetElementsByTagName("td")[1].InnerText;
                        //工作经验
                        GZJY = tr.GetElementsByTagName("tr")[5].GetElementsByTagName("td")[3].InnerText;
                        //在职单位
                        ZZDW = tr.GetElementsByTagName("tr")[6].GetElementsByTagName("td")[1].InnerText;
                        //在职职位
                        ZZZW = tr.GetElementsByTagName("tr")[6].GetElementsByTagName("td")[3].InnerText;
                        //工作经历
                        GZJY = tr.GetElementsByTagName("tr")[7].GetElementsByTagName("td")[1].InnerText;
                        //要求月薪
                        YX = tr.GetElementsByTagName("tr")[9].GetElementsByTagName("td")[1].InnerText;
                        //工作性质
                        GZXZ = tr.GetElementsByTagName("tr")[9].GetElementsByTagName("td")[3].InnerText;
                        //求职意向
                        QZYX = tr.GetElementsByTagName("tr")[10].GetElementsByTagName("td")[1].InnerText;
                        //具体职务
                        JTZW = tr.GetElementsByTagName("tr")[10].GetElementsByTagName("td")[3].InnerText;
                        //期望工作地
                        QWGZD = tr.GetElementsByTagName("tr")[11].GetElementsByTagName("td")[1].InnerText;
                        //教育情况,语言水平,技术专长
                        QT = tr.GetElementsByTagName("tr")[13].GetElementsByTagName("td")[1].InnerText;

                        insert();
                    }
                    catch
                    { }
                }
            }
            catch { }
        }

      //将数据插入数据库 
      private void insert()
        {
            try
            {
                string str = "Provider=Microsoft.Jet.OleDb.4.0;Data Source=Data.mdb";
                string sql = "insert into 人才信息 (姓名,年龄,性别,身高,政治面貌,民族,学历,婚烟状况,所学专业,";
                sql += "工作经验,在职单位,在职职位,工作经历,要求月薪,工作性质,求职意向,具体职务,期望工作地,其他) values ";
                sql += "('" + XM + "'," + nl + ",'" + XB + "','" + SG + "','" + mm + "','" + MZ + "','" + XL + "','" + HK + "','" + ZY + "','" + GZJY + "','" + ZZDW + "','" + ZZZW + "',";
                sql += "'" + GZJY + "','" + YX + "','" + GZXZ + "','" + QZYX + "','" + JTZW + "','" + QWGZD + "','" + QT + "')";

                OleDbConnection con = new OleDbConnection(str);
                OleDbCommand com = new OleDbCommand(sql, con);
                con.Open();
                com.ExecuteNonQuery();
                con.Close();
            }
     catch { }
        }

        //返回一个HtmlElementCollection,然后进行查询内容
        private HtmlElementCollection HtmlTR_Content(string strWeb, string tj)
        {
            try
            {
                //生成HtmlDocument
                WebBrowser webb = new WebBrowser();
                webb.Navigate("about:blank");
                //window.document返回一个htmldocument对象,表示对一个html文档的操作
                //htmldocument对象是在xmldocument基础上建立的,具有xmldocument的一切方法属性
                HtmlDocument htmldoc = webb.Document.OpenNew(true);
                htmldoc.Write(strWeb);
                HtmlElementCollection htmlTR = htmldoc.GetElementsByTagName(tj);

                return htmlTR;
            }
            catch { return null; }
        }


        //获得网址原代码
        private string YM(string Url)
        {
            string strResult = "";

            try
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                request.Method = "GET";
                HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                Stream streamReceive = response.GetResponseStream();
                Encoding encoding = Encoding.GetEncoding("GB2312");
                StreamReader streamReader = new StreamReader(streamReceive, encoding);
                strResult = streamReader.ReadToEnd();
            }
            catch { }

            return strResult;
        }
    }

//这个程序写的不是太好,全都是用for循环遍历出来的,效率不是太高,那位高手可以使用 多线程 指点一下