Java-使用htmlparser爬取手机商城的手机信息

手机信息抓取解决方案

在spring-mvc项目中引入htmlparser.jar包,抓取中关村的手机品牌信息,示例代码如下:

package com.luowei.common.service.imp;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Resource;
import org.htmlparser.*;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.nodes.*;
import org.htmlparser.util.*;
import org.htmlparser.tags.*;
import org.springframework.stereotype.Service;
import com.luowei.common.dao.TPhonebrandDAO;
import com.luowei.common.model.Action;
import com.luowei.common.model.TPhonebrand;
import com.luowei.common.service.BaseService;
import com.luowei.common.service.ITPhonebrandService;
    /**
     * @className:TPhonebrandManageService.java
     * @classDescription:手机品牌Service
     * @author:luowei
     * @createTime:2012-1-12
     */
    @Service
    public class TPhonebrandManageService extends BaseService<TPhonebrand> implements ITPhonebrandService
    {
        @Resource(name="TPhonebrandDAO")
        private TPhonebrandDAO tPhonebrandDAO;
        public TPhonebrandDAO gettPhonebrandDAO() {
            return tPhonebrandDAO;
        }
        public void settPhonebrandDAO(TPhonebrandDAO tPhonebrandDAO) {
            this.tPhonebrandDAO = tPhonebrandDAO;
        }
        /**
         * 品牌类别过滤器
         * @author luowei
         * @createTime 2012-1-13
         * @param
         * @return
         */
        class BrandFilter extends StringFilter
        {
            public BrandFilter(String pattern)
            {
                super(pattern);
            }
            public boolean accept(Node node) {
                if(node!=null)
                {
                    if(node instanceof TagNode)
                    {
                        TagNode tn=(TagNode)node;
                        String vl=tn.getAttribute("class");
                        if(vl!=null&&vl.equals(this.getPattern()))
                        {
                            return true;
                        }
                    }
                }
                return false;
            }
        }
        /**
         * 从中关村抓取品牌信息
         * @author luowei
         * @createTime 2012-1-13
         * @param
         * @return
         */
        public List<TPhonebrand> getInforFromZOL()
        {
            try
            {
//初始两个页面解析器
                Parser parser1=new Parser("http://detail.zol.com.cn/category/57.html");
                Parser parser2=new Parser("http://detail.zol.com.cn/category/57.html");
                List<TPhonebrand> tPhonebrandList = new ArrayList<TPhonebrand>();
                String[] manu={"manu normal","manu txt"};
//获取普通品牌
                tPhonebrandList.addAll(findNormalBrand(parser1.parse(new BrandFilter((manu[0])))));
//获取其它品牌
                tPhonebrandList.addAll(findOtherBrand(parser2.parse(new BrandFilter(manu[1]))));
                return tPhonebrandList;
            }
            catch(Exception e)
            {
                e.printStackTrace();
            }
            return null;
        }
        /**
         * 获取普通品牌(class=manu nomal)的品牌
         * manu nomal 下的li子标签如下结构:
         * <li>
         * <a href="/cell_phone_index/subcate57_98_list_1.html">
         * <img alt="Samsung(三星) 手机" src="http://2c.zol-img.com.cn/manu_photo/98_.jpg">
         * </a>
         * <br>
         * <a title="Samsung(三星) 手机" href="/cell_phone_index/subcate57_98_list_1.html">Samsung(三星)</a>
         * </li>
         * @author luowei
         * @createTime 2012-1-13
         * @param
         * @return
         */
        public List<TPhonebrand> findNormalBrand(NodeList nl1)
        {
            List<TPhonebrand> list=new ArrayList<TPhonebrand>();
            for(int i=0;i<nl1.size();i++)
            {
                Node n1=nl1.elementAt(i); //获取第i个class=manu normal的结点
                NodeList nl2=n1.getChildren();
                for(int j=0;j<nl2.size();j++)
                {
                    Node n2=nl2.elementAt(j); //获取li的结点
                    LinkTag _a=(LinkTag)n2.getFirstChild(); //获取嵌套了图片标签的a标签结点
                    // if(n2.getFirstChild().getFirstChild() instanceof org.htmlparser.nodes.TextNode )
                    // {
                    // System.out.println("-----------"+(n2.getFirstChild()).getClass().toString());
                    // String textNode=n2.getFirstChild().getFirstChild().getText();
                    // System.out.println("-----------"+textNode);
                    // }
                    ImageTag _img=(ImageTag)n2.getFirstChild().getFirstChild(); //获取图片标签结点
                    LinkTag _an=(LinkTag)n2.getLastChild(); //获取品牌链接
                    //LinkTag _an=(LinkTag)_a.getNextSibling();
                    String name=_an.getLinkText(); //得到品牌名称
                    String imgpath=_img.getImageURL(); //得到图片地址
                    String url=_a.getLink(); //得到图链接地址
                    TPhonebrand tphonebrand=new TPhonebrand();
                    tphonebrand.setPhonebrandName(name); //保存品牌名称到tphonebrand对象
                    tphonebrand.setPhonebrandImage(imgpath);
                    tphonebrand.setRelateURL(url);
                    list.add(tphonebrand);
                }
            }
            return list;
        }
        /**
         * 获取其它品牌(class=manu txt)的品牌
         * manu txt 下的li子标签如下结构:
         * <li>
         * <a title="BlackBerry(黑莓) 手机" href="/cell_phone_index/subcate57_12772_list_1.html">BlackBerry(黑莓)手机</a>
         * </li>
         * @author luowei
         * @createTime 2012-1-12
         * @param
         * @return
         */
        public List<TPhonebrand> findOtherBrand(NodeList nl1) //
        {
            List<TPhonebrand> list=new ArrayList<TPhonebrand>();
            for(int i=0;i<nl1.size();i++)
            {
                Node n1=nl1.elementAt(i); //定位到第i个class=manu txt的结点
                NodeList nl2=n1.getChildren();
                for(int j=0;j<nl2.size();j++) //遍历manu txt结点下的li结点
                {
                    Node n2=nl2.elementAt(j); //定位到第j个li的结点
                    TPhonebrand tphonebrand=new TPhonebrand();
                    LinkTag _an=(LinkTag)n2.getFirstChild();//获取品牌链接
                    String name=_an.getLinkText().replace("手机", "");
                    String url=_an.getLink();
                    tphonebrand.setPhonebrandName(name);
                    tphonebrand.setRelateURL(url);
                    list.add(tphonebrand);
                }
            }
            return list;
        }
        public boolean alter(Action obj) {
            return false;
        }
        public boolean delete(Action obj) {
            return false;
        }
        public boolean save(Action obj) {
            return false;
        }
        public static void main(String[] args) {
            TPhonebrandManageService tPhonebrandManageService=new TPhonebrandManageService();
            List<TPhonebrand> tPhjonebrandList=tPhonebrandManageService.getInforFromZOL();
            for(TPhonebrand tPhonebrand:tPhjonebrandList)
            {
                System.out.println(tPhonebrand.getPhonebrandName()
                        +"\n"+tPhonebrand.getRelateURL()+"\n----------");
            }
        }
    }

版权所有,转载请注明出处 luowei.github.io.