keyan/te_u/arxiv.py

151 lines
5.1 KiB
Python
Raw Normal View History

2024-06-17 14:04:28 +08:00
import undetected_chromedriver as uc
import time
import random
import json
import matplotlib.pyplot as plt # 数据可视化
import jieba # 词语切割
import wordcloud # 分词
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS # 词云,颜色生成器,停止词
import numpy as np # 科学计算
from PIL import Image # 处理图片
from bs4 import BeautifulSoup
from lxml import etree
# Disabled helper (kept commented out); nothing in the visible code calls it.
# def get_current_page_result(driver):
#     """Collect every item on the current results page."""
#     result_area = driver.find_element(by="id", value="ModuleSearchResult")
#     current_page_results = result_area.find_elements(by="xpath", value='//tbody/tr')
#
#     names = [r.find_element(by="xpath", value='td[@class="name"]') for r in current_page_results]
#     links = [r.find_element(by="xpath", value='td[@class="name"]/a').get_attribute("href") for r in current_page_results]
#
#     items = get_items(driver, links)
#     return items
def get_items(driver, links):
    """Scrape every paper page in *links* and return the collected records.

    Args:
        driver: Browser driver, forwarded unchanged to ``get_item``.
        links: Iterable of paper page URLs; they are visited in order.

    Returns:
        A list holding one ``get_item`` result per link, in input order.
        An empty iterable yields an empty list without touching the driver.
    """
    return [get_item(driver, link) for link in links]
def get_item(driver, link):
    """Open one paper page and extract its metadata.

    Args:
        driver: Browser driver exposing ``get``, ``find_element`` and
            ``find_elements``.
        link: URL of the paper page to load.

    Returns:
        A dict with the keys ``name`` (title text), ``authors`` (author link
        texts joined with commas), ``affiliations`` (always the placeholder
        ``"no"``) and ``abstract`` (abstract text).
    """
    driver.get(link)
    # Randomised pause (5 to 8 seconds) so the page can finish loading.
    pause = 5 + 3 * random.random()
    time.sleep(pause)

    # Title
    title_element = driver.find_element(
        by="xpath", value='//h1[contains(@class, "title")]'
    )
    title = title_element.text

    # Authors: one anchor per author inside the "authors" container.
    author_anchors = driver.find_elements(
        by="xpath", value='//div[@class="authors"]//a'
    )
    authors = ",".join(anchor.text for anchor in author_anchors)

    # Abstract
    abstract = driver.find_element(
        by="xpath", value='//blockquote[contains(@class, "abstract")]'
    ).text

    return {
        "name": title,
        "authors": authors,
        "affiliations": "no",  # affiliations are not scraped; fixed placeholder
        "abstract": abstract,
    }
def get_links_etree(driver):
    """Return the result links found on the driver's current page.

    The current ``driver.page_source`` is parsed with lxml, and the ``href``
    of every anchor matching the result-list XPath is returned.

    Args:
        driver: Browser driver whose ``page_source`` holds the results page.

    Returns:
        A list of ``href`` attribute values, in document order.
    """
    result_link_xpath = '//ol[@class="breathe-horizontal"]/li/div/p/a/@href'
    page_tree = etree.HTML(driver.page_source)
    return page_tree.xpath(result_link_xpath)
def get_news_from_arxiv(total_num, keyword):
    """Search arXiv for *keyword* and scrape up to *total_num* papers.

    The search results are paged through until enough links are collected
    or no "next page" link can be clicked. Each collected paper page is then
    scraped with ``get_items``, and the records are saved as JSON to
    ``result_arxiv_<terms joined by "_">.json`` in the working directory.

    Args:
        total_num: Maximum number of papers to scrape. Values below zero
            are treated as zero.
        keyword: Whitespace-separated search terms.

    Returns:
        The list of scraped paper records (same data as written to disk).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote_plus

    terms = keyword.split()
    # Percent-encode every term so characters such as "&", "#" or "%" cannot
    # corrupt the query string; terms are still joined with "+".
    query = "+".join(quote_plus(term) for term in terms)
    url = f"https://arxiv.org/search/?query={query}&searchtype=all&source=header"

    driver = uc.Chrome()
    try:
        driver.get(url)
        time.sleep(5 + 3 * random.random())  # wait for the results page to load

        # Collect result links page by page.
        links = []
        while True:
            links.extend(get_links_etree(driver))
            if len(links) >= total_num:
                # Enough links gathered; stop paging.
                break
            try:
                next_page_btn = driver.find_element(
                    by="xpath", value='//a[@class="pagination-next"]'
                )
                next_page_btn.click()
                time.sleep(2 + 2 * random.random())  # wait for the next page
            except Exception as e:
                # Best-effort paging: any failure to reach the next page ends
                # the collection and keeps what was gathered so far.
                # Message: "No next page; returning all results collected so far."
                print("没有下一页,返回当前的采集的所有结果", e)
                break

        # Clamp so a negative count cannot silently slice items off the end.
        links = links[:max(total_num, 0)]
        results = get_items(driver, links)
    finally:
        # quit() ends the whole browser session, and runs on failure too,
        # so no browser process is left behind.
        driver.quit()

    with open(f"result_arxiv_{'_'.join(terms)}.json", "w", encoding="utf8") as f:
        f.write(json.dumps(results))
    return results
def get_clouds(word_list, font_path=r'C:\Windows\Fonts\STZHONGS.ttf'):
    """Render a word cloud image from a collection of text fragments.

    The fragments are merged, segmented into words with jieba, and drawn
    with ``WordCloud``.

    Args:
        word_list: Iterable of strings (or a single string) to visualise.
        font_path: Path of the font file handed to ``WordCloud``. Defaults
            to the Windows font path this function previously hard-coded.

    Returns:
        The rendered word cloud as returned by ``WordCloud.to_array()``.
    """
    if isinstance(word_list, str):
        # A single string is used as-is; joining would split it per character.
        text = word_list
    else:
        # Join with a space so the last word of one fragment does not fuse
        # with the first word of the next.
        text = " ".join(word_list)

    tokens = jieba.lcut(text)  # word segmentation
    spaced_text = ' '.join(tokens)  # WordCloud expects space-separated words

    # Build a private stop-word set. set.update() returns None and mutates
    # the library-wide STOPWORDS, so it must not be used as the argument.
    stopwords = STOPWORDS | {'in', 'of', 'for'}

    wc = WordCloud(
        width=400,
        height=300,
        background_color='white',
        mode='RGB',
        max_words=200,
        stopwords=stopwords,
        font_path=font_path,
        max_font_size=100,
        relative_scaling=0.6,  # how strongly word frequency drives font size
        random_state=50,  # fixed seed for a reproducible layout
        scale=2,
    ).generate(spaced_text)
    return wc.to_array()
if __name__ == '__main__':
    # Example run: scrape up to 5 papers for the query "knowledge graph".
    get_news_from_arxiv(total_num=5, keyword="knowledge graph")