网络爬虫:搜索引擎为了让自己的数据库足够强大,没日没夜地在网络上寻找信息,以使自己的信息更全面。大家都知道,互联网上的信息是无穷的,而且呈爆炸式增长,搜索引擎不可能靠人工去收集这些信息,于是人们编写小程序不停地在网络上抓取信息,网络爬虫便由此产生了。
下面我用 Java 实现了一个简单的、专门抓取邮箱地址的小工具,做得非常粗略,仅供大家参考。
这是效果图:

啥也不说了,直接上代码吧:

import java.awt.AWTException;
import java.awt.BorderLayout;
import java.awt.Dimension;
import java.awt.Image;
import java.awt.MenuItem;
import java.awt.PopupMenu;
import java.awt.SystemTray;
import java.awt.Toolkit;
import java.awt.TrayIcon;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.ImageIcon;
import javax.swing.JButton;
import javax.swing.JComboBox;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JScrollPane;
import javax.swing.JTextArea;
import javax.swing.JTextField;
import javax.swing.SwingUtilities;
import javax.swing.UIManager;
import javax.swing.UnsupportedLookAndFeelException;

/**
 * A small Swing tool that harvests e-mail addresses from the web.
 *
 * <p>The keyword typed by the user is sent to the selected search engine; every link found on
 * the result page is then downloaded (crawl depth 2) and each line of every page is scanned for
 * e-mail addresses, which are appended to the text area.
 *
 * <p>Threading: all Swing components and the {@link #count} counter are only touched on the
 * Event Dispatch Thread. Network I/O runs on background threads that are cancelled cooperatively
 * (cancel flag plus {@link Thread#interrupt()}) instead of the unsafe {@code Thread.stop()}.
 *
 * <p>Note: this class extends {@link JFrame} but the visible window is the separate
 * {@link #frame} field; that structure is kept for compatibility with existing callers.
 *
 * @author http://javaflex.iteye.com/
 */
public class MainFrm extends JFrame implements ActionListener {
    private static final long serialVersionUID = 1L;

    private static final Logger LOG = Logger.getLogger(MainFrm.class.getName());

    // Action commands. Buttons and AWT menu items use their label as the action command,
    // so the very same constants are used for the labels and for dispatching.
    private static final String CMD_FETCH = "提取邮箱";
    private static final String CMD_STOP = "停止抓取";
    private static final String CMD_OPEN = "1.打 开";
    private static final String CMD_EXIT = "2.退 出";
    private static final String CMD_ABOUT = "3.关 于";

    /** Search engines offered in the combo box, parallel to {@link #SEARCH_PREFIXES}. */
    private static final String[] ENGINE_NAMES = { "Baidu", "Google", "Yahoo", "Bing", "Sogou" };

    /** Query URL prefix for each entry of {@link #ENGINE_NAMES}; index 0 is the fallback. */
    private static final String[] SEARCH_PREFIXES = {
        "http://www.baidu.com/s?wd=",
        "http://www.google.com.hk/search?num=50&q=",
        "http://www.yahoo.cn/s?q=",
        "http://cn.bing.com/search?q=",
        "http://www.sogou.com/web?query="
    };

    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)";

    /** Without timeouts a dead host would block a worker forever and make "stop" ineffective. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    private static final int READ_TIMEOUT_MS = 15000;

    private static final Charset DEFAULT_CHARSET = Charset.forName("UTF-8");

    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    // NOTE(review): the local part does not admit '.', so "first.last@x.com" is reported as
    // "last@x.com". The pattern is kept unchanged to preserve existing output.
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\\.[a-zA-Z0-9_-]+)+");

    private static final Pattern URL_PATTERN = Pattern.compile(
            "(http|ftp|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&:/~\\+#]*[\\w\\-\\@?^=%&/~\\+#])?");

    /** Running number of the next reported address; only read and written on the EDT. */
    static int count = 1;

    /** Not used by this class; retained for source compatibility. */
    static int countUrl = 1;

    JFrame frame;
    JButton b1;
    JButton b2;
    JTextArea t1;
    JTextField tf;
    JPanel panel;
    JScrollPane jScrollPane1;
    JLabel label;
    @SuppressWarnings("rawtypes")
    JComboBox comb;
    PopupMenu pm;

    /** Background threads of the current crawl; a concurrent list because workers register here. */
    List<Thread> t = new CopyOnWriteArrayList<Thread>();

    /** Not used by this class any more; retained for source compatibility. */
    static int m = 0;

    /** Cancel flag of the crawl in progress, or {@code null}; only accessed on the EDT. */
    private AtomicBoolean cancelFlag;

    /** Whether the tray icon has been registered; only accessed on the EDT. */
    private boolean trayInstalled;

    /**
     * Builds and shows the main window. Must be called on the Event Dispatch Thread.
     *
     * @return this instance, to allow call chaining
     */
    @SuppressWarnings({ "rawtypes", "unchecked" })
    MainFrm into() {
        pm = new PopupMenu();
        MenuItem openItem = new MenuItem(CMD_OPEN);
        MenuItem closeItem = new MenuItem(CMD_EXIT);
        MenuItem aboutItem = new MenuItem(CMD_ABOUT);
        openItem.addActionListener(this);
        closeItem.addActionListener(this);
        aboutItem.addActionListener(this);
        pm.add(openItem);
        pm.add(closeItem);
        pm.add(aboutItem);

        comb = new JComboBox(ENGINE_NAMES);
        panel = new JPanel();
        tf = new JTextField(50);
        tf.setText("留下邮箱");
        label = new JLabel("关键字:");

        frame = new JFrame("邮箱抓取(注:抓取深度暂时默认为2) QQ:三二八二四七六七六");
        // The icon is optional: a missing resource must not prevent the window from opening.
        URL iconUrl = MainFrm.class.getResource("mail.png");
        if (iconUrl != null) {
            frame.setIconImage(new ImageIcon(iconUrl).getImage());
        }

        b1 = new JButton(CMD_FETCH);
        b1.addActionListener(this);
        b2 = new JButton(CMD_STOP);
        b2.addActionListener(this);

        t1 = new JTextArea();
        t1.setLineWrap(true);
        jScrollPane1 = new JScrollPane(t1);
        jScrollPane1.setPreferredSize(new Dimension(200, 200));

        this.setDefaultCloseOperation(DO_NOTHING_ON_CLOSE);
        frame.addWindowListener(new WindowAdapter() {
            @Override
            public void windowClosing(WindowEvent e) {
                System.exit(0);
            }

            @Override
            public void windowIconified(WindowEvent e) {
                systemTray();
                // Only hide the window when a tray icon exists to bring it back.
                if (trayInstalled) {
                    frame.setVisible(false);
                }
            }
        });

        panel.add(label);
        panel.add(tf);
        panel.add(comb);
        panel.add(b1);
        panel.add(b2);
        frame.getContentPane().add(panel, BorderLayout.NORTH);
        frame.getContentPane().add(jScrollPane1, BorderLayout.CENTER);

        // Size and position the window before showing it so it does not visibly jump.
        frame.pack();
        frame.setLocationRelativeTo(null);
        frame.setAlwaysOnTop(true);
        frame.setVisible(true);
        return this;
    }

    /**
     * Application entry point: installs the system look and feel and opens the window on the EDT.
     *
     * @param args ignored
     */
    public static void main(String[] args) throws ClassNotFoundException,
            InstantiationException, IllegalAccessException, UnsupportedLookAndFeelException {
        UIManager.setLookAndFeel(UIManager.getSystemLookAndFeelClassName());
        SwingUtilities.invokeLater(new Runnable() {
            @Override
            public void run() {
                new MainFrm().into().systemTray();
            }
        });
    }

    /**
     * Dispatches button and tray-menu commands.
     *
     * @param e the event whose action command selects the operation
     */
    @Override
    public void actionPerformed(ActionEvent e) {
        String command = e.getActionCommand();
        if (CMD_FETCH.equals(command)) {
            startCrawl();
        } else if (CMD_STOP.equals(command)) {
            stopCrawl();
        } else if (CMD_OPEN.equals(command)) {
            frame.setVisible(true);
            frame.setExtendedState(JFrame.NORMAL);
        } else if (CMD_EXIT.equals(command)) {
            System.exit(0);
        } else if (CMD_ABOUT.equals(command)) {
            // Parent the dialog on the always-on-top frame so it cannot open behind it.
            JOptionPane.showMessageDialog(frame.isVisible() ? frame : null,
                    "本程序仅供初学参考 QQ:三二八二四七六七六");
        }
    }

    /** Cancels any crawl in progress, clears the output and starts a new crawl in the background. */
    private void startCrawl() {
        stopCrawl();
        t.clear();
        count = 1;
        t1.setText("");

        final String searchUrl = buildSearchUrl(comb.getSelectedIndex(), tf.getText());
        final AtomicBoolean cancelled = new AtomicBoolean(false);
        cancelFlag = cancelled;

        // The search page itself is fetched off the EDT so the UI stays responsive.
        Thread coordinator = new Thread(new Runnable() {
            @Override
            public void run() {
                crawlSearchResults(searchUrl, cancelled);
            }
        }, "crawl-coordinator");
        coordinator.setDaemon(true);
        t.add(coordinator);
        coordinator.start();
    }

    /** Asks all background threads of the current crawl to finish as soon as possible. */
    private void stopCrawl() {
        if (cancelFlag != null) {
            cancelFlag.set(true);
        }
        for (Thread worker : t) {
            worker.interrupt();
        }
    }

    /** Downloads the search result page and starts one worker per line that contains links. */
    @SuppressWarnings("rawtypes")
    private void crawlSearchResults(String searchUrl, final AtomicBoolean cancelled) {
        List<Map> linksPerLine = get(searchUrl);
        for (final Map links : linksPerLine) {
            if (cancelled.get()) {
                return;
            }
            if (links.isEmpty()) {
                continue;
            }
            Thread worker = new Thread(new Runnable() {
                @Override
                public void run() {
                    for (Object link : links.values()) {
                        // Checked before every download; also stops workers that were
                        // started while a stop request was being processed.
                        if (cancelled.get() || Thread.currentThread().isInterrupted()) {
                            return;
                        }
                        get(String.valueOf(link));
                    }
                }
            }, "crawl-worker");
            worker.setDaemon(true);
            t.add(worker);
            worker.start();
        }
    }

    /**
     * Builds the search URL for a keyword. Package-private for tests.
     *
     * @param engineIndex index into the engine list; any out-of-range value selects Baidu
     * @param keyword the search keyword; {@code null} is treated as empty
     * @return the engine's query prefix followed by the URL-encoded (UTF-8) keyword
     */
    static String buildSearchUrl(int engineIndex, String keyword) {
        boolean known = engineIndex >= 0 && engineIndex < SEARCH_PREFIXES.length;
        String prefix = SEARCH_PREFIXES[known ? engineIndex : 0];
        try {
            return prefix + URLEncoder.encode(keyword == null ? "" : keyword, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException("UTF-8 must be supported by every JVM", e);
        }
    }

    /**
     * Determines the charset announced in a {@code Content-Type} header. Package-private for tests.
     *
     * @param contentType the header value, may be {@code null}
     * @return the announced charset, or UTF-8 when it is absent, malformed or unsupported
     */
    static Charset parseCharset(String contentType) {
        if (contentType != null) {
            Matcher matcher = CHARSET_PATTERN.matcher(contentType);
            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException e) {
                    LOG.log(Level.FINE, "Unusable charset in Content-Type: " + contentType, e);
                }
            }
        }
        return DEFAULT_CHARSET;
    }

    /**
     * Finds the e-mail addresses in one line of text. Package-private for tests.
     *
     * @param line the text to scan, may be {@code null}
     * @return the distinct addresses in order of first appearance; never {@code null}
     */
    static List<String> extractEmails(String line) {
        Set<String> unique = new LinkedHashSet<String>();
        if (line != null) {
            Matcher matcher = EMAIL_PATTERN.matcher(line);
            while (matcher.find()) {
                unique.add(matcher.group());
            }
        }
        return new ArrayList<String>(unique);
    }

    /**
     * Finds the http/https/ftp links in one line of text. Package-private for tests.
     *
     * @param line the text to scan, may be {@code null}
     * @return a map from each distinct link to itself, in order of first appearance
     */
    static Map<String, String> extractLinks(String line) {
        Map<String, String> links = new LinkedHashMap<String, String>();
        if (line != null) {
            Matcher matcher = URL_PATTERN.matcher(line);
            while (matcher.find()) {
                links.put(matcher.group(), matcher.group());
            }
        }
        return links;
    }

    /**
     * Downloads a page and scans every line with {@link #pr(String)}.
     *
     * <p>Best effort: unreachable or invalid URLs are logged at {@code FINE} level and produce a
     * shorter (possibly empty) result. Reading stops early when the calling thread is interrupted.
     * Blocks on network I/O, so it should not be called on the Event Dispatch Thread.
     *
     * @param urlStr the address to download
     * @return one map per line read, each holding the links found in that line
     */
    @SuppressWarnings("rawtypes")
    public List<Map> get(String urlStr) {
        List<Map> list = new ArrayList<Map>();
        URLConnection connection = null;
        BufferedReader reader = null;
        try {
            // No cast to HttpURLConnection here: the link pattern also accepts ftp:// URLs,
            // for which the cast would throw ClassCastException and kill the worker thread.
            connection = new URL(urlStr).openConnection();
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);
            connection.setRequestProperty("User-Agent", USER_AGENT);
            reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), parseCharset(connection.getContentType())));
            String line;
            while (!Thread.currentThread().isInterrupted() && (line = reader.readLine()) != null) {
                list.add(pr(line));
            }
        } catch (IOException e) {
            // Dead links are expected while crawling; keep going but leave a trace.
            LOG.log(Level.FINE, "Could not read " + urlStr, e);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done when closing fails.
                }
            }
            if (connection instanceof HttpURLConnection) {
                ((HttpURLConnection) connection).disconnect();
            }
        }
        return list;
    }

    /**
     * Scans one line of text: e-mail addresses are appended to the text area (on the EDT) and
     * the links found are returned.
     *
     * @param aa the line to scan, may be {@code null}
     * @return a map from each link found in the line to itself; never {@code null}
     */
    @SuppressWarnings("rawtypes")
    public Map pr(String aa) {
        final List<String> emails = extractEmails(aa);
        if (!emails.isEmpty() && t1 != null) {
            // Swing components and the shared counter must only be touched on the EDT.
            SwingUtilities.invokeLater(new Runnable() {
                @Override
                public void run() {
                    for (String email : emails) {
                        t1.append("第" + (count++) + "个:" + email + "\r\n");
                    }
                }
            });
        }
        return extractLinks(aa);
    }

    /**
     * Registers the tray icon with its popup menu, once. Does nothing when the platform has no
     * system tray, when the icon resource is missing, or when the icon is already registered.
     */
    public void systemTray() {
        if (trayInstalled || !SystemTray.isSupported()) {
            return;
        }
        URL iconUrl = getClass().getResource("email_go.png");
        if (iconUrl == null) {
            LOG.warning("Tray icon resource email_go.png not found; tray icon disabled");
            return;
        }
        try {
            Image image = Toolkit.getDefaultToolkit().getImage(iconUrl);
            TrayIcon trayIcon = new TrayIcon(image);
            trayIcon.setToolTip("邮箱抓取");
            trayIcon.setPopupMenu(pm);
            SystemTray.getSystemTray().add(trayIcon);
            trayInstalled = true;
        } catch (AWTException e) {
            LOG.log(Level.WARNING, "Could not add the tray icon", e);
        } catch (SecurityException e) {
            LOG.log(Level.WARNING, "Not allowed to use the system tray", e);
        }
    }

    /**
     * Returns the standard component description.
     *
     * <p>The previous implementation opened a second window and returned {@code null}, so any
     * debugger, logger or string concatenation touching this object spawned a new frame.
     */
    @Override
    public String toString() {
        return super.toString();
    }
}
@author http://javaflex.iteye.com/

自动发送邮件的功能(待续)
