`

使用HttpClient和HtmlParser获取指定网址的title

    博客分类:
  • java
阅读更多
package com.xinhuanet.cloudDesk.controller;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.htmlparser.Parser;
import org.htmlparser.visitors.HtmlPage;

/**
 * Demo program: downloads a page with Commons HttpClient (through a fixed
 * proxy), detects the page charset from its HTML {@code <meta>} declaration,
 * decodes the body with that charset and prints the page title extracted by
 * HTMLParser.
 */
public class R {

	/** Charset used when the page declares none, or declares one this JVM cannot decode. */
	private static final String DEFAULT_CHARSET = "UTF-8";

	/**
	 * Charset used only for the first, throw-away decode that looks for the
	 * meta tag. The meta declaration itself is plain ASCII, so a fixed
	 * single-byte charset is enough and keeps the result independent of the
	 * platform default.
	 */
	private static final String SNIFF_CHARSET = "ISO-8859-1";

	/**
	 * Matches {@code charset=<name>} following a {@code <meta} tag start and
	 * captures the name in group 1. Compiled once; case-insensitive so that
	 * {@code <META ... CHARSET=...>} is recognised as well.
	 */
	private static final Pattern META_CHARSET_PATTERN = Pattern.compile(
			"<meta.+?charset=[^\\w]?([-\\w]+)", Pattern.CASE_INSENSITIVE);

	/**
	 * Fetches the hard-coded URL and prints the detected charset, the decoded
	 * page and its title to standard output.
	 *
	 * @param args unused
	 * @throws Exception if the request, the decoding or the parsing fails
	 */
	public static void main(String[] args) throws Exception {

		HttpClient httpClient = new HttpClient();
		httpClient.getHostConfiguration().setProxy("202.84.17.41", 8080);

		HttpConnectionManager httpConnManager = httpClient
				.getHttpConnectionManager();

		if (httpConnManager != null) {
			HttpConnectionManagerParams mgrParams = new HttpConnectionManagerParams();
			// NOTE(review): 20000000 looks unusually large for a timeout value
			// -- confirm the intended unit/magnitude.
			mgrParams.setSoTimeout(20000000);
			mgrParams.setTcpNoDelay(true);
			mgrParams.setConnectionTimeout(20000000);
			mgrParams.setLinger(0);
			mgrParams.setStaleCheckingEnabled(false);
			httpConnManager.setParams(mgrParams);
		}

		String url = "http://www.poetry4cn.com";
		GetMethod methodGet = new GetMethod(url);
		byte[] body;
		try {
			httpClient.executeMethod(methodGet);
			// Read the body exactly once; it is decoded twice below.
			body = methodGet.getResponseBody();
		} finally {
			// Always hand the connection back, even when the request fails.
			methodGet.releaseConnection();
		}
		if (body == null) {
			// No response body: continue with an empty page instead of failing
			// with a NullPointerException while decoding.
			body = new byte[0];
		}

		String detected = getCharSet(new String(body, SNIFF_CHARSET));
		System.out.println("getCharSet:" + detected);
		String charset = resolveCharset(detected);
		String responseGet = new String(body, charset);
		System.out.println(responseGet);

		Parser myParser = Parser.createParser(responseGet, charset);
		HtmlPage visitor = new HtmlPage(myParser);
		myParser.visitAllNodesWith(visitor);
		String textInPage = visitor.getTitle();
		System.out.println("title:" + textInPage);

	}

	/**
	 * Extracts the charset name declared in an HTML {@code <meta>} tag.
	 *
	 * @param content HTML text to search; may be {@code null}
	 * @return the first declared charset name, or {@code null} when
	 *         {@code content} is {@code null} or contains no declaration
	 */
	public static String getCharSet(String content) {
		if (content == null) {
			return null;
		}
		Matcher matcher = META_CHARSET_PATTERN.matcher(content);
		if (matcher.find()) {
			return matcher.group(1);
		}
		return null;
	}

	/**
	 * Picks the charset actually used for decoding: the detected one when this
	 * JVM supports it, otherwise {@link #DEFAULT_CHARSET}.
	 */
	private static String resolveCharset(String detected) {
		if (detected == null) {
			return DEFAULT_CHARSET;
		}
		try {
			if (java.nio.charset.Charset.isSupported(detected)) {
				return detected;
			}
		} catch (IllegalArgumentException e) {
			// Syntactically illegal charset name declared by the page
			// (e.g. a leading '-'); fall through to the default.
		}
		return DEFAULT_CHARSET;
	}

}
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics