# Spaces:
# Build error
# Build error
import re

import jieba
import jieba.analyse as jay
import paddle
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Funnel
from transformers import PegasusForConditionalGeneration

from tokenizers_pegasus import PegasusTokenizer
# Switch Paddle to static-graph mode as soon as this module is imported.
# NOTE(review): presumably required by jieba's paddle segmentation mode — confirm.
paddle.enable_static()

# NOTE: Python only honours an encoding declaration on line 1 or 2 of a file
# (PEP 263); at this position the marker below is an ordinary comment.
# -*- coding: utf-8 -*-

# Workbook read by get_keywords_in_article().
ARTICLE_PATH = '2.xlsx'
# Not referenced by any code visible in this part of the file.
CACHE_PATH = ''
# Number of per-keyword columns ('关键词1' … '关键词20') exported for each row.
_KEYWORD_COLUMN_COUNT = 20

# Captures whatever sits between each 'pan>' and the next '</t' in a raw cell
# (re.S lets a capture span line breaks).
_KEYWORD_SPAN_RE = re.compile('pan>(.*?)</t', flags=re.S)
# ASCII letters, digits and curly double quotes are stripped from every capture.
_KEYWORD_NOISE_RE = re.compile(r'[0-9a-zA-Z“”]')


def _parse_keyword_cell(data):
    """Return the cleaned, non-empty keyword strings found in one raw cell."""
    captures = _KEYWORD_SPAN_RE.findall(str(data).strip())
    cleaned = (_KEYWORD_NOISE_RE.sub('', capture) for capture in captures)
    return [keyword for keyword in cleaned if keyword]


def get_keywords_in_article(article_path=None, output_path='3.xlsx'):
    """Parse the keyword column of the workbook and export it in wide form.

    The first sheet of the workbook is read, every cell of its '字段1' column
    is parsed into a list of keywords, the four sheet columns are renamed to
    '期刊名称', '2022复合影响因子', '2022综合影响因子' and '关键词', and the
    first 20 keywords of each row are spread over the columns
    '关键词1' … '关键词20'. The resulting table is written to ``output_path``.

    Args:
        article_path: Workbook to read. Defaults to the module-level
            ``ARTICLE_PATH`` when omitted.
        output_path: Destination of the exported table.

    Returns:
        A ``(new_list, keyword_excel_new_pd)`` tuple: the distinct keywords of
        the whole sheet (in first-seen order) and the wide table.

    Raises:
        ValueError: If the sheet does not have exactly four columns (raised by
            pandas when the column labels are assigned).
    """
    if article_path is None:
        article_path = ARTICLE_PATH

    excel_pd = pd.read_excel(article_path, index_col=None, sheet_name=0)

    # Parse every cell exactly once and reuse the result for both outputs.
    excel_pd['字段1'] = excel_pd['字段1'].map(_parse_keyword_cell)
    keyword_lists = excel_pd['字段1'].tolist()
    excel_pd.columns = ['期刊名称', '2022复合影响因子', '2022综合影响因子', '关键词']

    # reindex() both truncates and pads to exactly 20 columns; a plain
    # iloc[:, :20] leaves fewer columns when no row has 20 keywords, which
    # made the fixed-length column labelling fail.
    keyword_excel_pd = pd.DataFrame(keyword_lists).reindex(
        columns=range(_KEYWORD_COLUMN_COUNT)
    )
    keyword_excel_pd.columns = [
        f'关键词{i}' for i in range(1, _KEYWORD_COLUMN_COUNT + 1)
    ]

    keyword_excel_new_pd = pd.concat([excel_pd, keyword_excel_pd], axis=1)
    keyword_excel_new_pd.to_excel(output_path)

    # dict.fromkeys() de-duplicates while keeping a deterministic order.
    new_list = list(
        dict.fromkeys(
            keyword for keywords in keyword_lists for keyword in keywords
        )
    )
    return new_list, keyword_excel_new_pd
_SUMMARY_MODEL_NAME = "IDEA-CCNL/Randeng-Pegasus-238M-Summary-Chinese"
# Holds the loaded model/tokenizer so repeated calls do not reload them.
_summarizer_cache = {}


def _load_summarizer():
    """Return ``(model, tokenizer)``, loading both on the first call only."""
    if not _summarizer_cache:
        model = PegasusForConditionalGeneration.from_pretrained(_SUMMARY_MODEL_NAME)
        tokenizer = PegasusTokenizer.from_pretrained(_SUMMARY_MODEL_NAME)
        # Store both together so a failed load never leaves a half-filled cache.
        _summarizer_cache.update(model=model, tokenizer=tokenizer)
    return _summarizer_cache['model'], _summarizer_cache['tokenizer']


def extract_tags(text):
    """Recommend journals whose keywords contain the top keyword of ``text``.

    The text is summarised with the Pegasus model, the single highest-ranked
    keyword of that summary is extracted with jieba, and every row of the
    table returned by ``get_keywords_in_article()`` that carries this keyword
    in one of '关键词1' … '关键词20' is selected.

    Side effects: prints the summary and its segmentation, overwrites
    '4.csv' with the raw matches and writes the ranked result to '4.xlsx'.

    Args:
        text: The article text to analyse.

    Returns:
        ``[labels, values_1]`` — the '期刊名称' values and the matching
        '2022复合影响因子' values as floats, sorted by the latter in
        descending order. Both lists are empty when no keyword is extracted.
    """
    new_list, keyword_excel_new_pd = get_keywords_in_article()

    model, tokenizer = _load_summarizer()
    # max_length only takes effect together with truncation=True; without it
    # over-long inputs are passed to the model unshortened.
    inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"])
    data_decode = tokenizer.batch_decode(
        summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(data_decode)

    # Register the sheet's keywords so jieba keeps them as single tokens.
    for word in new_list:
        jieba.add_word(word)
    cut_result_str = jieba.cut(data_decode, use_paddle=True)
    print("Paddle Mode: " + '/'.join(cut_result_str))

    tok_result = jay.extract_tags(data_decode, topK=1, withWeight=False)
    if not tok_result:
        # Nothing to search for (e.g. an empty summary).
        return [[], []]
    top_keyword = str(tok_result[0])

    keyword_columns = [f'关键词{i}' for i in range(1, 21)]
    hit_mask = keyword_excel_new_pd[keyword_columns].isin([top_keyword]).any(axis=1)
    matches = keyword_excel_new_pd.loc[hit_mask]

    # The cache file is overwritten on every call: appending to it mixed the
    # matches of earlier calls (for other keywords) into the current result.
    cache_path = '4.csv'
    matches.to_csv(cache_path)

    # NOTE(review): dropna() discards every row with any empty cell, which
    # includes rows that have fewer than 20 keywords — kept as in the original.
    # NOTE(review): de-duplication is keyed on '2022综合影响因子', so distinct
    # journals with an identical value collapse into one — kept as in the original.
    pro_new_pd = matches.dropna().drop_duplicates('2022综合影响因子').copy()

    # Drops the first 13 characters before converting to float — presumably a
    # fixed label prefix in front of the number; confirm against the sheet.
    pro_new_pd['2022复合影响因子'] = (
        pro_new_pd['2022复合影响因子'].str[13:].astype('float64')
    )
    pro_new_pd = pro_new_pd.sort_values(by=['2022复合影响因子'], ascending=False)
    pro_new_pd.to_excel('4.xlsx')

    labels = pro_new_pd['期刊名称'].tolist()
    values_1 = pro_new_pd['2022复合影响因子'].tolist()
    return [labels, values_1]
def figure_get(labels, values_1):
    """Render the first ten (label, value) pairs as a funnel chart HTML page.

    Args:
        labels: Sequence of series labels; only the first ten are used.
        values_1: Sequence of values paired positionally with ``labels``.

    The chart is written to '根据您的论文所推荐的期刊.html'; nothing is returned.
    """
    top_pairs = [
        [label, value] for label, value in zip(labels[:10], values_1[:10])
    ]

    chart = Funnel()
    chart.add(
        '根据您的论文所推荐期刊',
        top_pairs,
        label_opts=opts.LabelOpts(position="inside", is_show=True),
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="根据您的论文所推荐的期刊"),
        legend_opts=opts.LegendOpts(is_show=False),
        toolbox_opts=opts.ToolboxOpts(is_show=False),
    )
    chart.render("根据您的论文所推荐的期刊.html")