100字范文,内容丰富有趣,生活中的好帮手!
100字范文 > java jsoup 网络爬虫 学习例子(八)京东和淘宝商品比价 PhantomJS

java jsoup 网络爬虫 学习例子(八)京东和淘宝商品比价 PhantomJS

时间:2023-06-30 17:17:06

相关推荐

java jsoup 网络爬虫 学习例子(八)京东和淘宝商品比价 PhantomJS

java jsoup 网络爬虫 学习例子(八)京东和淘宝商品比价 PhantomJS

/** filename getHtml.js* phantomjs.exe 2.0.0* author InJavaWeTrust*/var system = require('system');var address = '';if (system.args.length != 2) {console.log('Try to pass two args when invoking this script!');phantom.exit();} else {address = system.args[1];}var page = require('webpage').create();var url = address;phantom.outputEncoding = 'GBK';page.open(url, function (status) {if (status !== 'success') {console.log('Failed to get the page!');} else {console.log(page.content);}phantom.exit();});package com.iteye.injavawetrust.phantomjs;import java.util.List;/*** * @author InJavaWeTrust**/public interface ProductList {/*** 爬取商品列表* @return*/public List<ProductInfo> getProductList();}package com.iteye.injavawetrust.phantomjs;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/*** * @author InJavaWeTrust**/public class TBProductList implements ProductList{private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();private String tbUrl;private String productName;public TBProductList(String tbUrl, String productName) {this.tbUrl = tbUrl;this.productName = productName;}@Overridepublic List<ProductInfo> getProductList() {List<ProductInfo> tbProductList = new ArrayList<ProductInfo>();ProductInfo productInfo = null;String url = "";int page = 0;for(int i = 0; i < 10; i++){try {System.out.println("TB Product 第[" + (i + 1) + "]页");if(i == 0){url = tbUrl;}else{page += 44;url = Constants.TBURL + pcu.getUrlCode(productName) + Constants.TBPAGE + page;}System.out.println(url);Document doc = Jsoup.parse(pcu.getHtmlByPhantomjs(url));Elements itemlist = doc.select("div[class=m-itemlist]");Iterator<Element> it = itemlist.iterator();while(it.hasNext()){Element item = it.next();Elements items = item.select("div[data-category=auctions]");Iterator<Element> one = items.iterator();while(one.hasNext()){Element e = one.next();Elements price = e.select("div[class=price g_price g_price-highlight]>strong");String productPrice = price.text();Elements title = e.select("div[class=row row-2 title]>a");String productName = title.text();productInfo = new ProductInfo();productInfo.setProductName(productName);productInfo.setProductPrice(productPrice);tbProductList.add(productInfo);}}} catch(Exception e) {System.out.println("Get TB product has error");System.out.println(e.getMessage());}}return tbProductList;}public static void main(String[] args) {try{String productName = "铅笔";String tbUrl = Constants.TBURL + pcu.getUrlCode(productName);List<ProductInfo> list = new TBProductList(tbUrl, productName).getProductList();for(ProductInfo pi : list){System.out.println("[" + pi.getProductName() + "] [" + pi.getProductPrice() + "]");}}catch(Exception e){e.printStackTrace();}}}package com.iteye.injavawetrust.phantomjs;import java.util.ArrayList;import java.util.Iterator;import java.util.List;import org.jsoup.Jsoup;import org.jsoup.nodes.Document;import org.jsoup.nodes.Element;import org.jsoup.select.Elements;/*** * @author InJavaWeTrust**/public class JDProductList implements ProductList{private String jdUrl;private String productName;private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();public JDProductList(String jdUrl, String productName){this.jdUrl = jdUrl;this.productName = productName;}@Overridepublic List<ProductInfo> getProductList() {List<ProductInfo> jdProductList = new ArrayList<ProductInfo>();ProductInfo productInfo = null;String url = "";for(int i = 0; i < 10; i++){try {System.out.println("JD Product 第[" + (i + 1) + "]页");if(i == 0) {url = jdUrl;}else{url = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC + Constants.JDPAGE + (i + 1);}System.out.println(url);Document document = Jsoup.connect(url).timeout(5000).get();Elements uls = document.select("ul[class=gl-warp clearfix]");Iterator<Element> ulIter = uls.iterator();while(ulIter.hasNext()) {Element ul = ulIter.next();Elements lis = ul.select("li[data-sku]");Iterator<Element> liIter = lis.iterator();while(liIter.hasNext()) {Element li = liIter.next();Element div = li.select("div[class=gl-i-wrap]").first();Elements title = div.select("div[class=p-name p-name-type-2]>a");String productName = title.attr("title"); //得到商品名称Elements price = div.select(".p-price>strong");String productPrice =price.attr("data-price"); //得到商品价格productInfo = new ProductInfo();productInfo.setProductName(productName);productInfo.setProductPrice(productPrice);jdProductList.add(productInfo);}}} catch(Exception e) {System.out.println("Get JD product has error [" + url + "]");System.out.println(e.getMessage());}}return jdProductList;}public static void main(String[] args) {try {String productName = "书包";String jdUrl = Constants.JDURL + pcu.getGbk(productName) + Constants.JDENC;List<ProductInfo> list = new JDProductList(jdUrl, productName).getProductList();System.out.println(list.size());for(ProductInfo pi : list){System.out.println(pi.getProductName() + " " + pi.getProductPrice());}} catch (Exception e) {e.printStackTrace();}}}package com.iteye.injavawetrust.phantomjs;/*** * @author InJavaWeTrust**/public class Constants {/*** JDURL*/public static String JDURL = "/Search?keyword=";/*** JD汉字编码格式*/public static String JDENC = "&enc=utf-8";/*** JD分页*/public static String JDPAGE ="&page=";/*** TBURL*/public static String TBURL = "/search?q=";/*** 淘宝分页*/public static String TBPAGE = "&s=";/*** 超时时间*/public static int TIMEOUT = 50000;/*** 获取页面script*/public static String SCRIPT = "E:\\InJavaWeTrust\\js\\getHtml.js ";/*** phantomjs.exe path*/public static String PHANTOMJSPATH = "D:\\Program Files\\phantomjs\\bin\\phantomjs.exe ";}package com.iteye.injavawetrust.phantomjs;import java.io.Serializable;import java.util.Date;/*** * @author InJavaWeTrust**/public class ProductInfo implements Serializable{private static final long serialVersionUID = 8179244535272774089L;/*** 商品ID*/private String productid;/*** 商品名称*/private String productName;/*** 商品价格*/private String productPrice;/*** 月销售笔数*/private String tradeNum;/*** 商品URL*/private String productUrl;/*** 商品网店名称*/private String shopName;/*** 电商名称*/private String ecName;/*** 爬取入库日期*/private Date date;public String getProductid() {return productid;}public void setProductid(String productid) {this.productid = productid;}public String getProductName() {return productName;}public void setProductName(String productName) {this.productName = productName;}public String getProductPrice() {return productPrice;}public void setProductPrice(String productPrice) {this.productPrice = productPrice;}public String getTradeNum() {return tradeNum;}public void setTradeNum(String tradeNum) {this.tradeNum = tradeNum;}public String getProductUrl() {return productUrl;}public void setProductUrl(String productUrl) {this.productUrl = productUrl;}public String getShopName() {return shopName;}public void setShopName(String shopName) {this.shopName = shopName;}public String getEcName() {return ecName;}public void setEcName(String ecName) {this.ecName = ecName;}public Date getDate() {return date;}public void setDate(Date date) {this.date = date;}}package com.iteye.injavawetrust.phantomjs;import java.io.BufferedReader;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.UnsupportedEncodingException;import .URL;import .URLEncoder;import java.text.SimpleDateFormat;import java.util.List;import java.util.TimeZone;import mons.logging.LogFactory;import com.gargoylesoftware.htmlunit.BrowserVersion;import com.gargoylesoftware.htmlunit.HttpMethod;import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;import com.gargoylesoftware.htmlunit.WebClient;import com.gargoylesoftware.htmlunit.WebRequest;import com.gargoylesoftware.htmlunit.html.HtmlPage;/*** * @author InJavaWeTrust**/public class PriceCheckUtil {private PriceCheckUtil() {}private static final PriceCheckUtil instance = new PriceCheckUtil();public static PriceCheckUtil getInstance() {return instance;}/*** 商品汉字转码* @param productName 商品名称* @return*/public String getGbk(String productName){String retGbk = "";try {retGbk = new String(productName.getBytes("UTF-8"), "GBK");} catch (UnsupportedEncodingException e) {e.printStackTrace();}return retGbk;}/*** 对淘宝浏览器汉字进行转换* @param productName 商品名称* @return*/public String getUrlCode(String productName){String retUrlCode = "";try {retUrlCode = URLEncoder.encode(productName, "utf8");} catch (UnsupportedEncodingException e) {e.printStackTrace();}return retUrlCode;}/*** 从列表list中找到与productName相似度最高的ProductInfo** @param productName* @param list* @return 相似度最高的productName*/public ProductInfo getSimilarity(String productName, List<ProductInfo> list) {ProductInfo productInfo = null;/*** 找到list中所有的productName与字符串productName的相似度,保存在lens数组中*/double lens[] = new double[list.size()];for (int i = 0; i < list.size() - 1; i++) {lens[i] = sim(productName, list.get(i).getProductName());}/*** 遍历出最大的相似度maxLen*/double maxLen = 0.0;for (int i = 0; i < lens.length; i++) {if (maxLen < lens[i]) {maxLen = lens[i];}}/*** 遍历出最大的相似度的索引maxLenIndex*/int maxLenIndex = 0;for (int i = 0; i < lens.length; i++) {if (maxLen == lens[i]) {maxLenIndex = i;}}productInfo = list.get(maxLenIndex);return productInfo;}/*** 求三个数中最小的一个* @param one* @param two* @param three* @return*/public int min(int one, int two, int three) {int min = one;if(two < min) {min = two;}if(three < min) {min = three;}return min;}/*** 计算矢量距离* Levenshtein Distance(LD)* @param str1* @param str2* @return*/public int ld(String str1, String str2) {int d[][]; //矩阵int n = str1.length();int m = str2.length();int i; //遍历str1的int j; //遍历str2的char ch1; //str1的char ch2; //str2的int temp; //记录相同字符,在某个矩阵位置值的增量,不是0就是1if(n == 0) {return m;}if(m == 0) {return n;}d = new int[n+1][m+1];for(i=0; i<=n; i++) { //初始化第一列d[i][0] = i;}for(j=0; j<=m; j++) { //初始化第一行d[0][j] = j;}for(i=1; i<=n; i++) { //遍历str1ch1 = str1.charAt(i-1);//去匹配str2for(j=1; j<=m; j++) {ch2 = str2.charAt(j-1);if(ch1 == ch2) {temp = 0;} else {temp = 1;}//左边+1,上边+1, 左上角+temp取最小d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+temp);}}return d[n][m];}/*** 计算相似度* @param str1* @param str2* @return*/public double sim(String str1, String str2) {int ld = ld(str1, str2);return 1 - (double) ld / Math.max(str1.length(), str2.length());}/** * 毫秒转换成hhmmss * @param ms 毫秒 * @return hh:mm:ss */ public String msToss(long ms) {SimpleDateFormat formatter = new SimpleDateFormat("HH:mm:ss"); formatter.setTimeZone(TimeZone.getTimeZone("GMT+00:00")); String ss = formatter.format(ms); return ss; }/*** 禁止htmlunit日志输出*/public void offLog(){LogFactory.getFactory().setAttribute("mons.logging.Log","mons.logging.impl.NoOpLog");}/*** 获取淘宝数据* @param url* @return* @throws Exception*/public String getXmlByHtmlunit(String url) throws Exception {offLog();String ret = "";WebClient webClient = new WebClient(BrowserVersion.CHROME);// 1 启动JSwebClient.getOptions().setJavaScriptEnabled(true);// 2 禁用Css,可避免自动二次请求CSS进行渲染webClient.getOptions().setCssEnabled(false);// 3 启动客户端重定向webClient.getOptions().setRedirectEnabled(true);// 4 JS运行错误时,是否抛出异常webClient.getOptions().setThrowExceptionOnScriptError(false);// 5AJAX supportwebClient.setAjaxController(new NicelyResynchronizingAjaxController());// 6 设置超时webClient.getOptions().setTimeout(Constants.TIMEOUT);WebRequest webRequest = new WebRequest(new URL(url));webRequest.setHttpMethod(HttpMethod.GET);HtmlPage page = webClient.getPage(webRequest);webClient.waitForBackgroundJavaScript(10000);ret = page.asXml();webClient.close();return ret;}/*** 通过Phantomjs得到html页面* @param url* @return*/public String getHtmlByPhantomjs(String url) {StringBuilder html = new StringBuilder();try {Runtime rt = Runtime.getRuntime();Process p = rt.exec(Constants.PHANTOMJSPATH + Constants.SCRIPT + url);InputStream is = p.getInputStream();BufferedReader br = new BufferedReader(new InputStreamReader(is));String tmp = "";while ((tmp = br.readLine()) != null) {html.append(tmp);}} catch (IOException e) {e.printStackTrace();}return html.toString();}}package com.iteye.injavawetrust.phantomjs;import java.util.ArrayList;import java.util.HashMap;import java.util.List;import java.util.Map;import java.util.Scanner;/*** * @author InJavaWeTrust**/public class PriceCheckMain {private static PriceCheckUtil pcu = PriceCheckUtil.getInstance();public List<Map<String, ProductInfo>> getProductList(String productName) {String jdUrl = Constants.JDURL + productName + Constants.JDENC;String tbUrl = Constants.TBURL + productName;return getProductFromUrls(jdUrl, tbUrl, productName);}public List<Map<String, ProductInfo>> getProductFromUrls(String jdUrl, String tbUrl, String productName) {List<Map<String, ProductInfo>> retListMap = new ArrayList<Map<String,ProductInfo>>();List<ProductInfo> jdProductList = new JDProductList(jdUrl, productName).getProductList();List<ProductInfo> tbProductList = new TBProductList(tbUrl, productName).getProductList();for(int i = 0; i < jdProductList.size(); i++){String jdProductName = jdProductList.get(i).getProductName();Map<String, ProductInfo> map = new HashMap<String, ProductInfo>();map.put("JD", jdProductList.get(i));ProductInfo tbProduct = pcu.getSimilarity(jdProductName, tbProductList);map.put("TB", tbProduct);retListMap.add(map);}return retListMap;}public static void main(String[] args) {System.out.println("输入商品名称:");Scanner scanner = new Scanner(System.in);String productName = scanner.next();scanner.close();System.out.println("京东和淘宝[" + productName + "]商品比价开始。。。。。。");try{long starTime = System.currentTimeMillis();List<Map<String, ProductInfo>> list = new PriceCheckMain().getProductList(productName);for(Map<String, ProductInfo> map : list){String jdName = map.get("JD").getProductName();String jdPrice = map.get("JD").getProductPrice();String ddName = map.get("TB").getProductName();String ddPrice = map.get("TB").getProductPrice();System.out.println("[" + jdName + "] [" + ddName + "]");System.out.println("[" + jdPrice + "] [" + ddPrice + "]");System.out.println("-----------------------------------------------------------");}long endTime = System.currentTimeMillis();System.out.println("用时 [" + pcu.msToss(endTime - starTime) + "]");}catch(Exception e){System.out.println("error");System.out.println(e.getMessage());}}}

运行结果:

输入商品名称:

铅笔

京东和淘宝[铅笔]商品比价开始。。。。。。

JD Product 第[1]页

/Search?keyword=铅笔&enc=utf-8

JD Product 第[2]页

/Search?keyword=閾呯瑪&enc=utf-8&page=2

。。。。。。。。。。。。

TB Product 第[1]页

/search?q=铅笔

TB Product 第[2]页

/search?q=%E9%93%85%E7%AC%94&s=44

。。。。。。。。。。。。。。。。。

[马可9002铅笔 马克三角铅 笔易握正姿木杆 安全无毒2H HB 2B HB HB] [马可9001铅笔 三角形杆橡皮头 学生写字铅笔 HB 2B 满28元包邮]

[12.00] [8.96]

-----------------------------------------------------------

用时 [00:01:35]

本内容不代表本网观点和政治立场,如有侵犯你的权益请联系我们处理。
网友评论
网友评论仅供其表达个人看法,并不表明网站立场。