1.题目
请改进本讲中的两个示例之一:“背单词”或“网络爬虫”。你可以根据自己的想法来改进,以下是一些可供参考的改进意见:
“背单词”:界面可以再好看一点;可以去掉音标;可以改变单词显示的速度;可以增加标记生词并记到生词本中;可以增加测试的功能(单词含义可以随机选4个词的含义来让用户选择)等等。
“网络爬虫”:界面可以做成图形化界面;下载时可以判断是不是网页(其中有HTML标记);可以猜测网页的编码(charset);可以避免循环下载(将已下载过的网页记下来);可以处理相对地址;可以记录下来网页中得到的email地址等等。
评分标准:
程序能正常运行,使用了流或文本相关功能(5分);
程序中在示例的基础上增加了功能(4分);
程序有一定的复杂度或较好的创意或较好的界面(1分)。
2.题解
import javax
.swing
.*
;
import java
.awt
.*
;
import java
.awt
.event
.ActionEvent
;
import java
.awt
.event
.ActionListener
;
import java
.net
.MalformedURLException
;
import java
.net
.URL
;
import java
.io
.*
;
import java
.util
.*
;
import java
.util
.List
;
import java
.util
.concurrent
.*
;
import java
.util
.regex
.*
;
import java
.nio
.charset
.*
;
/**
 * A minimal Swing-based web crawler (the "网络爬虫" course exercise, improved).
 *
 * <p>Improvements over the original sample:
 * <ul>
 *   <li>accepts both {@code http:} and {@code https:} links (the default seed URL is
 *       https, so the original collected no links at all — bug fix);</li>
 *   <li>remembers already-downloaded URLs to avoid cyclic re-downloads;</li>
 *   <li>guesses the page charset from a {@code <meta ... charset=...>} declaration,
 *       falling back to the caller-supplied default;</li>
 *   <li>HTML detection is case-insensitive;</li>
 *   <li>the href {@link Pattern} is compiled once instead of on every call.</li>
 * </ul>
 *
 * <p>NOTE(review): {@link #handlecrawler()} performs network I/O on the Swing event
 * dispatch thread, so the UI freezes during a download. A SwingWorker would fix this,
 * but that is left unchanged to keep the control flow of the exercise recognizable.
 */
class URLCrawler extends JFrame implements ActionListener {

    /** Pending URLs to crawl; thread-safe in case worker threads are added later. */
    public static ConcurrentLinkedQueue<String> urls = new ConcurrentLinkedQueue<>();

    /** URLs already downloaded — prevents the crawler from looping over the same page. */
    private static final Set<String> visited = new HashSet<>();

    /** Matches href="..." / href='...' / href=bare attributes, case-insensitively. */
    private static final Pattern HREF_PATTERN = Pattern.compile(
            "\\s*href\\s*=\\s*(\"([^\"]*\")|(\'[^\']*\')|([^\'\">\\s]+))\\s*",
            Pattern.CASE_INSENSITIVE);

    /** Extracts a charset name from an HTML meta declaration, e.g. charset=gb2312. */
    private static final Pattern CHARSET_PATTERN = Pattern.compile(
            "charset\\s*=\\s*[\"\']?([\\w-]+)", Pattern.CASE_INSENSITIVE);

    // --- UI widgets ---------------------------------------------------------
    JLabel url = new JLabel("Input the Url", JLabel.CENTER);
    JTextField turl = new JTextField("https://www.baidu.com");
    JButton crawler = new JButton("Crawl");
    JTextArea tresult = new JTextArea();
    JScrollPane tresultScroll = new JScrollPane(tresult,
            JScrollPane.VERTICAL_SCROLLBAR_ALWAYS,
            JScrollPane.HORIZONTAL_SCROLLBAR_ALWAYS);
    JLabel urlCrawler = new JLabel("URL Crawler", JLabel.CENTER);
    JTextField turlCrawler = new JTextField();

    /** Builds the frame: input row on top, scrolling result area, status row below. */
    public URLCrawler() {
        super("网页爬虫");
        turl.setHorizontalAlignment(JTextField.CENTER);
        tresult.setLineWrap(true);

        JPanel pnlBody = new JPanel(new BorderLayout());
        pnlBody.add(BorderLayout.WEST, url);
        pnlBody.add(BorderLayout.CENTER, turl);
        pnlBody.add(BorderLayout.EAST, crawler);

        JPanel pnlencod = new JPanel(new GridLayout());
        pnlencod.add(urlCrawler);
        pnlencod.add(turlCrawler);

        getContentPane().setLayout(new BorderLayout());
        getContentPane().add(BorderLayout.NORTH, pnlBody);
        getContentPane().add(BorderLayout.CENTER, tresultScroll);
        getContentPane().add(BorderLayout.SOUTH, pnlencod);

        crawler.addActionListener(this);
        setDefaultCloseOperation(WindowConstants.EXIT_ON_CLOSE);
        setSize(400, 400);
        setVisible(true);
    }

    /** Dispatches the "Crawl" button; any crawl failure is logged, not fatal. */
    @Override
    public void actionPerformed(ActionEvent e) {
        if (e.getSource() == crawler) {
            try {
                handlecrawler();
            } catch (Exception exception) {
                exception.printStackTrace();
            }
        } else {
            System.out.println("Error");
        }
    }

    /**
     * Enqueues the URL from the text field, then crawls one URL from the queue:
     * downloads it, shows the content, and enqueues any links found in it.
     * Already-visited URLs are skipped to avoid cyclic downloads.
     *
     * @throws Exception on malformed URLs or I/O failures
     */
    public void handlecrawler() throws Exception {
        urls.add(turl.getText());
        if (!urls.isEmpty()) {
            String url = urls.poll();
            // Set.add returns false if the URL was seen before — skip duplicates.
            if (!visited.add(url)) {
                System.out.println("已下载过,跳过: " + url);
                return;
            }
            System.out.println("URL:-->:" + url);
            turlCrawler.setText(String.valueOf(url));
            // "utf-8" is only the fallback; download() guesses the real charset.
            String content = download(new URL(url), "utf-8");
            if (content.equals("false")) {
                System.out.println("检测到非网页,无HTML标记!");
                tresult.setText("检测到非网页,无HTML标记!");
            } else {
                tresult.setText(content);
                List<String> moreUrl = parse(content);
                urls.addAll(moreUrl);
            }
        }
    }

    /**
     * Extracts absolute http/https links from the href attributes in {@code text}.
     *
     * @param text HTML source to scan
     * @return list of absolute URLs (relative links are ignored)
     */
    static List<String> parse(String text) {
        Matcher matcher = HREF_PATTERN.matcher(text);
        List<String> list = new ArrayList<>();
        while (matcher.find()) {
            String href = matcher.group(1).replaceAll("\'", "").replaceAll("\"", "");
            // Bug fix: the original accepted only "http:", dropping every https link.
            if (href.startsWith("http:") || href.startsWith("https:")) {
                list.add(href);
            }
        }
        return list;
    }

    /**
     * Downloads {@code url} and decodes it, preferring a charset declared in the
     * page itself over the supplied fallback.
     *
     * @param url     page to fetch
     * @param charset fallback charset name used when the page declares none
     * @return the decoded page, or the legacy sentinel {@code "false"} when the
     *         content does not look like HTML (kept for caller compatibility)
     * @throws Exception on I/O failure or an unsupported fallback charset
     */
    static String download(URL url, String charset) throws Exception {
        try (InputStream input = url.openStream();
             ByteArrayOutputStream output = new ByteArrayOutputStream()) {
            byte[] data = new byte[1024];
            int length;
            while ((length = input.read(data)) != -1) {
                output.write(data, 0, length);
            }
            byte[] content = output.toByteArray();
            String encoding = guessCharset(content, charset);
            String str_result = new String(content, Charset.forName(encoding));
            if (looksLikeHtml(str_result)) {
                return str_result;
            }
            return "false";
        }
    }

    /**
     * Guesses the page encoding from a meta charset declaration in the raw bytes.
     * ISO-8859-1 is used for the probe decode because it maps every byte 1:1, so
     * the ASCII "charset=..." marker survives regardless of the true encoding.
     */
    private static String guessCharset(byte[] content, String fallback) {
        String probe = new String(content, StandardCharsets.ISO_8859_1);
        Matcher m = CHARSET_PATTERN.matcher(probe);
        if (m.find() && Charset.isSupported(m.group(1))) {
            return m.group(1);
        }
        return fallback;
    }

    /** Case-insensitive HTML check (the original missed upper-case &lt;HTML&gt; pages). */
    private static boolean looksLikeHtml(String text) {
        return text.toLowerCase(Locale.ROOT).contains("html");
    }

    /** Launches the UI on the event dispatch thread. */
    public static void main(String[] args) {
        SwingUtilities.invokeLater(URLCrawler::new);
    }
}