本教程介绍如何在 Java 中结合使用 Jsoup 与 HttpClient（Apache Commons HttpClient 3.x）实现网站的模拟登录，并在登录后抓取、解析所需的页面数据，适合需要自动化处理网站信息的开发者学习参考。
下面是结合 Jsoup 与 HttpClient 进行模拟登录并抓取页面的 Java 示例代码：
```java
package com.app.html;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.text.SimpleDateFormat;
import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
public class HttpClientHtml {
private static final String SITE = login.goodjobs.cn;
private static final int PORT = 80;
private static final String loginAction = /index.php/action/UserLogin;
private static final String forwardURL = http://user.goodjobs.cn/dispatcher.php/module/Personal/?skip_fill=1;
// 其他静态常量省略
/**
* 模拟登录
*
* @param LOGON_SITE 登录站点地址
* @param LOGON_PORT 端口
* @param login_Action 登录动作URL
* @param params 用户名和密码参数数组,包含两个元素:用户名及密码
*/
private static HttpClient loginHtml(String LOGON_SITE, int LOGON_PORT,
String login_Action, String...params) throws Exception {
HttpClient client = new HttpClient();
client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT);
// 设置登录参数并执行POST请求
PostMethod postLogin = new PostMethod(login_Action);
NameValuePair[] postData = {new NameValuePair(memberName, params[0]),
new NameValuePair(password, params[1])};
postLogin.setRequestBody(postData);
client.executeMethod(postLogin);
// 输出Cookies信息
CookiePolicy.getDefaultSpec().match(LOGON_SITE, LOGON_PORT, /, false,
client.getState().getCookies()).forEach(cookie -> System.out.println(cookie));
return client;
}
/**
* 使用已登录的客户端获取指定页面内容并保存为文件。
*
* @param client 已经执行过loginHtml方法返回的HttpClient对象
* @param newUrl 需要访问的目标URL地址
*/
private static String createHtml(HttpClient client, String newUrl) throws Exception {
SimpleDateFormat format = new SimpleDateFormat(yyyy-MM-dd);
// 设置文件保存路径与名称
FileUtil.write(client.executeMethod(new PostMethod(newUrl)).getResponseBodyAsString(),
toUrl + format.format(new Date())+ _ + 1 + .html);
System.out.println(\n写入文件成功!);
}
/**
* 使用Jsoup解析HTML代码,并对特定标签进行处理。
*
* @param filePath HTML文档路径
* @param random 随机数,用于生成新的输出文件名
*/
private static String JsoupFile(String filePath, int random) {
SimpleDateFormat format = new SimpleDateFormat(yyyy-MM-dd);
try (BufferedReader in = new BufferedReader(new FileReader(filePath));
BufferedWriter out = new BufferedWriter(
new OutputStreamWriter(new FileOutputStream(toUrl + format.format(new Date())+ _new_ + random+ .html), gbk))) {
Document doc = Jsoup.parse(in, GBK);
// 处理link标签
Elements links = doc.select(link);
for (Element link : links) {
String hrefAttr = link.attr(href).replace(/personal.css, css);
if (hrefAttr.contains(/personal.css)) {
Element hrefVal=link.attr(href, hrefAttr);
out.write(hrefVal.toString());
}
}
// 处理script标签
Elements scripts = doc.select(script);
for (Element js : scripts) {
String jsrc = js.attr(src).replace(/scripts/fValidate/fValidate.one.js, _JS);
if (jsrc.contains(/fValidate.one.js)) {
Element val=js.attr(src, jsrc);
out.write(val.toString());
}
}
// 处理body标签中的所有元素
Elements body = doc.getElementsByTag(body);
for (Element tag : body.select(*)) {
String src = tag.attr(src).replace(/images, Img);
if (src.contains(/images)) {
tag.attr(src, src);
out.write(body.toString());
}
}
} catch (IOException e) {
e.printStackTrace();
}
return toUrl + format.format(new Date())+ _new_ + random + .html;
}
public static void main(String[] args) throws Exception {
String [] params={admin,admin123};
HttpClient client = loginHtml(SITE, PORT, loginAction,params);
createHtml(client, forwardURL