热词云

爬取代码:

1 import requests
  2 from bs4 import BeautifulSoup
  3 import bs4
  4 # -*- coding: UTF-8 -*
  5 from urllib.request import urlopen
  6 from pdfminer.pdfinterp import PDFResourceManager, process_pdf
  7 from pdfminer.converter import TextConverter
  8 from pdfminer.layout import LAParams
  9 from io import StringIO
 10 from pyhanlp import *
 11 import time
 12 
 13 import requests
 14 import json
 15 from pymysql import *
 16 
 17 #连接数据库的方法
 18 def connectDB():
 19     try:
 20         db=connect(host='localhost',port=3306,user='root',password='123456',db='python')
 21         print("数据库连接成功")
 22         return db
 23     except Exception as e:
 24         print(e)
 25     return NULL
 26 
 27 db = connectDB()  # module-level connection; insertInformation() opens its cursors on it
 28 
 29 #向数据库中插入数据的方法
 30 def insertInformation(title,abstract,keywords,href):
 31     cursor=db.cursor()
 32     try:
 33         cursor.execute("insert into new_table(title,abstract,keywords,href) values('%s','%s','%s','%s')" % (title,abstract,keywords,href))
 34         print("插入成功")
 35         db.commit()
 36         cursor.close()
 37         return True
 38     except Exception as e:
 39         print(e)
 40         db.rollback()
 41     return False
 42 
 43 list_href=[]  # href of every <a> collected by getDataFromHtml
 44 list_title=[]  # text of the same <a> tags, appended in step with list_href
 45 
 46 def getHtmlText(url):
 47     r = requests.get(url)
 48     r.raise_for_status()
 49     r.encoding = r.apparent_encoding
 50     html = r.text
 51     return html
 52 
 53 
 54 
 55 def getDataFromHtml(list,html):
 56     bs = BeautifulSoup(html, "lxml")
 57     for td in bs.tbody.find_all("td"):
 58         if isinstance(td,bs4.element.Tag):
 59             for a in td.find_all("a"):
 60                 list_href.append(a['href'])
 61                 list_title.append(a.text)
 62 
 63 def showAll(list):
 64     for univ in list:
 65         print(univ)
 66 
 67 
 68 def readPDF(pdfFile):
 69     rsrcmgr = PDFResourceManager()
 70     retstr = StringIO()
 71     laparams = LAParams()
 72     device = TextConverter(rsrcmgr, retstr, laparams=laparams)
 73     process_pdf(rsrcmgr, device, pdfFile)
 74     device.close()
 75     content = retstr.getvalue()
 76     retstr.close()
 77     return content
 78 
 79 if __name__ == '__main__':
 80     url = "https://blog.csdn.net/u014636245/article/details/91426736"
 81     try:
 82         html = getHtmlText(url)
 83         getDataFromHtml(list,html)
 84         for i in range(0,len(list_title)):
 85             print(i)
 86             pdfFile = urlopen(list_href[i])
 87             # 远程
 88             outputString = readPDF(pdfFile)
 89             if "Abstract" in outputString:
 90                 document = ""
 91                 if "1. Introduction" in outputString and "Abstract" in outputString:
 92                     document = outputString[outputString.index("Abstract"):outputString.index("1. Introduction")]
 93                 elif "1.Introduction" in outputString and "Abstract" in outputString:
 94                     document = outputString[outputString.index("Abstract"):outputString.index("1.Introduction")]
 95                 else :
 96                     document = outputString[outputString.index("Abstract"):outputString.index("Abstract")+800]
 97                 # print(document)
 98                 keywords = HanLP.extractKeyword(document, 10)
 99                 print(keywords)
100                 str = ""
101                 for k in keywords:
102                     str+=k+" "
103                 pdfFile.close()
104                 insertInformation(list_title[i],document,str,list_href[i])
105             time.sleep(0.1)
106     except Exception as e:
107         print(e)
108         print("爬取失败")

py

结果:

热词云

数据库中会有很多条记录,每条记录的 keywords 字段里有 10 个关键词;

然后将它们从数据库中取出来放入数组,统计每个词出现的次数后进行排序,找出出现次数最多的词;

不要忘记将介词等无用词去掉;

统计词频并排序,最简单的办法是使用 Map:

// Sort the (word -> count) entries by count.
List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(map.entrySet());
// Sorting a collection by a custom criterion in Java requires a Comparator.
Comparator<Map.Entry<String, Integer>> comparator = new Comparator<Map.Entry<String, Integer>>() {
    public int compare(Map.Entry<String, Integer> left, Map.Entry<String, Integer> right) {
        return left.getValue().compareTo(right.getValue());
    }
};
// The comparator orders ascending, so the most frequent words end up last.
Collections.sort(list, comparator);
String ten[] = new String[50];
int shu[] = new int[50];
// Take at most 50 entries; with fewer distinct words the old fixed loop
// ran past the start of the list and threw IndexOutOfBoundsException.
int top = Math.min(50, list.size());
for (int i = 0; i < top; i++) { // walk from the end: highest count first
    Map.Entry<String, Integer> entry = list.get(list.size() - i - 1);
    ten[i] = entry.getKey();
    shu[i] = entry.getValue();

    Tu tu = new Tu();
    tu.name = ten[i];
    tu.value = shu[i];
    list_tu.add(tu);
    System.out.println(entry.getKey() + ":" + entry.getValue());
}

然后设置一个点击事件,并在 Servlet 中把结果转换成 JSON 格式返回,代码如下:

// Serialize the hot-word list to JSON and send it as the response body.
Gson gson = new Gson();
String json = gson.toJson(list_tu);
// Declare the payload type and charset before getWriter(); the writer
// otherwise falls back to the container's default encoding (ISO-8859-1 per
// the Servlet spec), which garbles non-Latin words.
response.setContentType("application/json;charset=UTF-8");
response.getWriter().write(json);

然后使用echarts设计热词云

<%@ page language="java" contentType="text/html; charset=UTF-8"
    pageEncoding="UTF-8"%>
<%@ taglib uri="http://java.sun.com/jsp/jstl/core" prefix="c"%>
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Insert title here</title>
<link rel="stylesheet" href="css/bootstrap.min.css" type="text/css" />
<script src="js/jquery-1.11.3.min.js" type="text/javascript"></script>
<script type="text/javascript" src="js/echarts.min.js"></script>
<script type="text/javascript" src="js/china.js"></script>
<script src="js/bootstrap.min.js" type="text/javascript"></script>
<script src='https://cdn.bootcss.com/echarts/3.7.0/echarts.simple.js'></script>
<script src='js/echarts-wordcloud.js'></script>
</head>
<body>
<!-- Container for the word cloud: the page script looks it up by id "main".
     ECharts needs a container with an explicit size; 600x300 matches the
     height set on the wordCloud series. NOTE(review): confirm intended size. -->
<div id="main" style="width: 600px; height: 300px;"></div>
<!-- Paper list: one row per entry of the request attribute "list". -->
<div>
  <table class="table" style="width: 100%;align-content: center;" >
    <tr>
      <th align="center">论文连接</th>
    </tr>
    <c:forEach var="item" items="${list}">
      <tr>
        <%-- c:out HTML-escapes the values; titles and links come from crawled pages. --%>
        <td><a href="<c:out value='${item.lianjie}'/>"><c:out value="${item.title}"/></a></td>
      </tr>
    </c:forEach>
  </table>
</div>
<script>
  // Draw the hot-word cloud inside the element with id "main".
  var chart = echarts.init(document.getElementById('main'));
  // Stays an empty list when the request fails, so the loop below is safe.
  var dt = [];
  $.ajax({
    url : "PaperServlet_",
    // NOTE(review): a synchronous request blocks the page while loading; it
    // is kept so that dt is filled before the chart options are built.
    async : false,
    type : "POST",
    dataType : "json",
    success : function(data) {
      dt = data || [];
    },
    error : function() {
      alert("请求失败");
    }
  });
  // Keep only the two fields the wordCloud series reads from each entry.
  var mydata = [];
  for (var i = 0; i < dt.length; i++) {
      mydata.push({
        name: dt[i].name,
        value: dt[i].value
      });
  }
  var option = {
    tooltip: {},
    series: [ {
      type: 'wordCloud',
      gridSize: 2,
      sizeRange: [20, 50],
      rotationRange: [-90, 90],
      shape: 'pentagon',
      width: 600,
      height: 300,
      drawOutOfBound: true,
      textStyle: {
        normal: {
          // Random colour per word; channels are capped at 160.
          color: function () {
            return 'rgb(' + [
              Math.round(Math.random() * 160),
              Math.round(Math.random() * 160),
              Math.round(Math.random() * 160)
            ].join(',') + ')';
          }
        },
        emphasis: {
          shadowBlur: 10,
          shadowColor: '#333'
        }
      },
      data: mydata
    } ]
  };

  chart.setOption(option);
  // Clicking a word opens the list of papers whose keywords contain it.
  chart.on('click', function (params) {
      // Encode the word so spaces and special characters survive the URL.
      var url = "ClickServlet?geunjian=" + encodeURIComponent(params.name);
      window.location.href = url;
    });
  // Call resize through the chart so that `this` is the chart instance.
  window.onresize = function () {
    chart.resize();
  };
</script>
</body>
</html>

点击热词后,页面携带该热词跳转到 Servlet,再从数据库中查出关键词包含该热词的论文列表:

import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import com.me.dao.LWDao;
import com.me.domain.LunWen;

/**
 * Handles a click on a hot word: looks up the papers for the clicked word
 * through {@link LWDao#login} and forwards them to {@code lw.jsp} as the
 * request attribute {@code list}.
 */
@WebServlet("/ClickServlet")
public class ClickServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /** Site prefix put in front of stored relative paper links. */
    private static final String PAPER_BASE_URL = "http://openaccess.thecvf.com/";

    /**
     * Number of leading characters dropped from a stored relative link.
     * NOTE(review): presumably the stored links start with a 6-character
     * relative prefix such as "../../" -- confirm against the table data.
     */
    private static final int RELATIVE_PREFIX_LENGTH = 6;

    LWDao dao = new LWDao();

    public ClickServlet() {
        super();
    }

    /**
     * Reads the request parameter {@code geunjian} (the clicked hot word),
     * fetches the matching papers, turns their links into absolute URLs and
     * forwards to {@code lw.jsp}. A missing parameter or a database error
     * results in an empty list rather than a failed request.
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        // Must be set before the first getParameter call to take effect.
        request.setCharacterEncoding("UTF-8");
        String geunjian = request.getParameter("geunjian");
        System.out.println(geunjian);

        List<LunWen> guan = new ArrayList<LunWen>();
        if (geunjian != null) {
            try {
                guan = dao.login(geunjian);
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        if (guan == null) {
            // Defensive: keep the JSP and the size log working if the DAO returns null.
            guan = new ArrayList<LunWen>();
        }

        for (LunWen paper : guan) {
            String link = paper.getLianjie();
            if (link != null) {
                paper.setLianjie(toAbsoluteLink(link));
            }
        }

        request.setAttribute("list", guan);
        System.out.println(guan.size());
        request.getRequestDispatcher("lw.jsp").forward(request, response);
    }

    /**
     * Converts a stored paper link into an absolute URL.
     * Links that are already absolute are returned unchanged, and links
     * shorter than the relative prefix are left as they are; the previous
     * unconditional {@code substring(6, ...)} corrupted the former and threw
     * StringIndexOutOfBoundsException on the latter.
     */
    private static String toAbsoluteLink(String link) {
        if (link.startsWith("http://") || link.startsWith("https://")) {
            return link;
        }
        if (link.length() < RELATIVE_PREFIX_LENGTH) {
            return link;
        }
        return PAPER_BASE_URL + link.substring(RELATIVE_PREFIX_LENGTH);
    }

    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        // POST is handled exactly like GET.
        doGet(request, response);
    }

}