Java-使用htmlparser爬取手机商城的手机信息
手机信息抓取解决方案
在spring-mvc项目中引入htmlparser.jar包,抓取中关村的手机品牌信息,示例代码如下:
package com.luowei.common.service.imp;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Resource;
import org.htmlparser.*;
import org.htmlparser.filters.StringFilter;
import org.htmlparser.nodes.*;
import org.htmlparser.util.*;
import org.htmlparser.tags.*;
import org.springframework.stereotype.Service;
import com.luowei.common.dao.TPhonebrandDAO;
import com.luowei.common.model.Action;
import com.luowei.common.model.TPhonebrand;
import com.luowei.common.service.BaseService;
import com.luowei.common.service.ITPhonebrandService;
/**
* @className:TPhonebrandManageService.java
* @classDescription:手机品牌Service
* @author:luowei
* @createTime:2012-1-12
*/
@Service
public class TPhonebrandManageService extends BaseService<TPhonebrand> implements ITPhonebrandService
{
@Resource(name="TPhonebrandDAO")
private TPhonebrandDAO tPhonebrandDAO;
public TPhonebrandDAO gettPhonebrandDAO() {
return tPhonebrandDAO;
}
public void settPhonebrandDAO(TPhonebrandDAO tPhonebrandDAO) {
this.tPhonebrandDAO = tPhonebrandDAO;
}
/**
* 品牌类别过滤器
* @author luowei
* @createTime 2012-1-13
* @param
* @return
*/
class BrandFilter extends StringFilter
{
public BrandFilter(String pattern)
{
super(pattern);
}
public boolean accept(Node node) {
if(node!=null)
{
if(node instanceof TagNode)
{
TagNode tn=(TagNode)node;
String vl=tn.getAttribute("class");
if(vl!=null&&vl.equals(this.getPattern()))
{
return true;
}
}
}
return false;
}
}
/**
* 从中关村抓取品牌信息
* @author luowei
* @createTime 2012-1-13
* @param
* @return
*/
public List<TPhonebrand> getInforFromZOL()
{
try
{
//初始两个页面解析器
Parser parser1=new Parser("http://detail.zol.com.cn/category/57.html");
Parser parser2=new Parser("http://detail.zol.com.cn/category/57.html");
List<TPhonebrand> tPhonebrandList = new ArrayList<TPhonebrand>();
String[] manu={"manu normal","manu txt"};
//获取普通品牌
tPhonebrandList.addAll(findNormalBrand(parser1.parse(new BrandFilter((manu[0])))));
//获取其它品牌
tPhonebrandList.addAll(findOtherBrand(parser2.parse(new BrandFilter(manu[1]))));
return tPhonebrandList;
}
catch(Exception e)
{
e.printStackTrace();
}
return null;
}
/**
* 获取普通品牌(class=manu nomal)的品牌
* manu nomal 下的li子标签如下结构:
* <li>
* <a href="/cell_phone_index/subcate57_98_list_1.html">
* <img alt="Samsung(三星) 手机" src="http://2c.zol-img.com.cn/manu_photo/98_.jpg">
* </a>
* <br>
* <a title="Samsung(三星) 手机" href="/cell_phone_index/subcate57_98_list_1.html">Samsung(三星)</a>
* </li>
* @author luowei
* @createTime 2012-1-13
* @param
* @return
*/
public List<TPhonebrand> findNormalBrand(NodeList nl1)
{
List<TPhonebrand> list=new ArrayList<TPhonebrand>();
for(int i=0;i<nl1.size();i++)
{
Node n1=nl1.elementAt(i); //获取第i个class=manu normal的结点
NodeList nl2=n1.getChildren();
for(int j=0;j<nl2.size();j++)
{
Node n2=nl2.elementAt(j); //获取li的结点
LinkTag _a=(LinkTag)n2.getFirstChild(); //获取嵌套了图片标签的a标签结点
// if(n2.getFirstChild().getFirstChild() instanceof org.htmlparser.nodes.TextNode )
// {
// System.out.println("-----------"+(n2.getFirstChild()).getClass().toString());
// String textNode=n2.getFirstChild().getFirstChild().getText();
// System.out.println("-----------"+textNode);
// }
ImageTag _img=(ImageTag)n2.getFirstChild().getFirstChild(); //获取图片标签结点
LinkTag _an=(LinkTag)n2.getLastChild(); //获取品牌链接
//LinkTag _an=(LinkTag)_a.getNextSibling();
String name=_an.getLinkText(); //得到品牌名称
String imgpath=_img.getImageURL(); //得到图片地址
String url=_a.getLink(); //得到图链接地址
TPhonebrand tphonebrand=new TPhonebrand();
tphonebrand.setPhonebrandName(name); //保存品牌名称到tphonebrand对象
tphonebrand.setPhonebrandImage(imgpath);
tphonebrand.setRelateURL(url);
list.add(tphonebrand);
}
}
return list;
}
/**
* 获取其它品牌(class=manu txt)的品牌
* manu txt 下的li子标签如下结构:
* <li>
* <a title="BlackBerry(黑莓) 手机" href="/cell_phone_index/subcate57_12772_list_1.html">BlackBerry(黑莓)手机</a>
* </li>
* @author luowei
* @createTime 2012-1-12
* @param
* @return
*/
public List<TPhonebrand> findOtherBrand(NodeList nl1) //
{
List<TPhonebrand> list=new ArrayList<TPhonebrand>();
for(int i=0;i<nl1.size();i++)
{
Node n1=nl1.elementAt(i); //定位到第i个class=manu txt的结点
NodeList nl2=n1.getChildren();
for(int j=0;j<nl2.size();j++) //遍历manu txt结点下的li结点
{
Node n2=nl2.elementAt(j); //定位到第j个li的结点
TPhonebrand tphonebrand=new TPhonebrand();
LinkTag _an=(LinkTag)n2.getFirstChild();//获取品牌链接
String name=_an.getLinkText().replace("手机", "");
String url=_an.getLink();
tphonebrand.setPhonebrandName(name);
tphonebrand.setRelateURL(url);
list.add(tphonebrand);
}
}
return list;
}
public boolean alter(Action obj) {
return false;
}
public boolean delete(Action obj) {
return false;
}
public boolean save(Action obj) {
return false;
}
public static void main(String[] args) {
TPhonebrandManageService tPhonebrandManageService=new TPhonebrandManageService();
List<TPhonebrand> tPhjonebrandList=tPhonebrandManageService.getInforFromZOL();
for(TPhonebrand tPhonebrand:tPhjonebrandList)
{
System.out.println(tPhonebrand.getPhonebrandName()
+"\n"+tPhonebrand.getRelateURL()+"\n----------");
}
}
}