forked from TeamWiseFlow/wiseflow
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_report.py
227 lines (192 loc) · 8.62 KB
/
get_report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import random
import re
import os
from core.backend import dashscope_llm
from docx import Document
from docx.oxml.ns import qn
from docx.shared import Pt, RGBColor
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from datetime import datetime
from general_utils import isChinesePunctuation
from general_utils import get_logger_level
from loguru import logger
from pb_api import PbTalker
project_dir = os.environ.get("PROJECT_DIR", "")
os.makedirs(project_dir, exist_ok=True)
logger_file = os.path.join(project_dir, 'backend_service.log')
dsw_log = get_logger_level()
logger.add(
logger_file,
level=dsw_log,
backtrace=True,
diagnose=True,
rotation="50 MB"
)
pb = PbTalker(logger)
# qwen-72b-chat支持最大30k输入,考虑prompt其他部分,content不应超过30000字符长度
# 如果换qwen-max(最大输入6k),这里就要换成6000,但这样很多文章不能分析了
# 本地部署模型(qwen-14b这里可能仅支持4k输入,可能根本这套模式就行不通)
max_input_tokens = 30000
role_config = pb.read(collection_name='roleplays', filter=f'activated=True')
_role_config_id = ''
if role_config:
character = role_config[0]['character']
report_type = role_config[0]['report_type']
_role_config_id = role_config[0]['id']
else:
character, report_type = '', ''
if not character:
character = input('\033[0;32m 请为首席情报官指定角色设定(eg. 来自中国的网络安全情报专家):\033[0m\n')
_role_config_id = pb.add(collection_name='roleplays', body={'character': character, 'activated': True})
if not _role_config_id:
raise Exception('pls check pb data无法获取角色设定')
if not report_type:
report_type = input('\033[0;32m 请为首席情报官指定报告类型(eg. 网络安全情报):\033[0m\n')
_ = pb.update(collection_name='roleplays', id=_role_config_id, body={'report_type': report_type})
def get_report(insigt: str, articles: list[dict], memory: str, topics: list[str], comment: str, docx_file: str) -> (bool, str):
zh_index = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '十一', '十二']
if isChinesePunctuation(insigt[-1]):
insigt = insigt[:-1]
# 分离段落和标题
if len(topics) == 0:
title = ''
elif len(topics) == 1:
title = topics[0]
topics = []
else:
title = topics[0]
topics = [s.strip() for s in topics[1:] if s.strip()]
schema = f'【标题】{title}\n\n【综述】\n\n'
if topics:
for i in range(len(topics)):
schema += f'【{zh_index[i]}、{topics[i]}】\n\n'
# 先判断是否是修改要求(有原文和评论,且原文的段落要求与给到的topics一致)
system_prompt, user_prompt = '', ''
if memory and comment:
paragraphs = re.findall("、(.*?)】", memory)
if set(topics) <= set(paragraphs):
logger.debug("no change in Topics, need modified the report")
system_prompt = f'''你是一名{character},你近日向上级提交了一份{report_type}报告,如下是报告原文。接下来你将收到来自上级部门的修改意见,请据此修改你的报告:
报告原文:
"""{memory}"""
'''
user_prompt = f'上级部门修改意见:"""{comment}"""'
if not system_prompt or not user_prompt:
logger.debug("need generate the report")
texts = ''
for article in articles:
if article['content']:
texts += f"<article>{article['content']}</article>\n"
else:
if article['abstract']:
texts += f"<article>{article['abstract']}</article>\n"
else:
texts += f"<article>{article['title']}</article>\n"
if len(texts) > max_input_tokens:
break
logger.debug(f"articles context length: {len(texts)}")
system_prompt = f'''你是一名{character},在近期的工作中我们从所关注的网站中发现了一条重要的{report_type}线索,线索和相关文章(用XML标签分隔)如下:
情报线索: """{insigt} """
相关文章:
{texts}
现在请基于这些信息按要求输出专业的书面报告。'''
if comment:
user_prompt = (f'1、不管原始资料是什么语言,你必须使用简体中文输出报告,除非是人名、组织和机构的名称、缩写;'
f'2、对事实的陈述务必基于所提供的相关文章,绝对不可以臆想;3、{comment}。\n')
else:
user_prompt = ('1、不管原始资料是什么语言,你必须使用简体中文输出报告,除非是人名、组织和机构的名称、缩写;'
'2、对事实的陈述务必基于所提供的相关文章,绝对不可以臆想。')
user_prompt += f'\n请按如下格式输出你的报告:\n{schema}'
# 生成阶段
check_flag = False
check_list = schema.split('\n\n')
check_list = [_[1:] for _ in check_list if _.startswith('【')]
result = ''
for i in range(2):
result = dashscope_llm([{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': user_prompt}],
'qwen1.5-72b-chat', seed=random.randint(1, 10000), logger=logger)
logger.debug(f"raw result:\n{result}")
if len(result) > 50:
check_flag = True
for check_item in check_list[2:]:
if check_item not in result:
check_flag = False
break
if check_flag:
break
logger.debug("result not good, re-generating...")
if not check_flag:
# 这里其实存在两种情况,一个是llm失效,一个是多次尝试后生成结果还是不行
if not result:
logger.warning('report-process-error: LLM out of work!')
return False, ''
else:
logger.warning('report-process-error: cannot generate, change topics and insight, then re-try')
return False, ''
# parse process
contents = result.split("【")
bodies = {}
for text in contents:
for item in check_list:
if text.startswith(item):
check_list.remove(item)
key, value = text.split("】")
value = value.strip()
if isChinesePunctuation(value[0]):
value = value[1:]
bodies[key] = value.strip()
break
if not bodies:
logger.warning('report-process-error: cannot generate, change topics and insight, then re-try')
return False, ''
if '标题' not in bodies:
if "】" in contents[0]:
_title = contents[0].split("】")[0]
bodies['标题'] = _title.strip()
else:
if len(contents) > 1 and "】" in contents[1]:
_title = contents[0].split("】")[0]
bodies['标题'] = _title.strip()
else:
bodies['标题'] = ""
doc = Document()
doc.styles['Normal'].font.name = u'宋体'
doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'宋体')
doc.styles['Normal'].font.size = Pt(12)
doc.styles['Normal'].font.color.rgb = RGBColor(0, 0, 0)
# 先写好标题和摘要
if not title:
title = bodies['标题']
Head = doc.add_heading(level=1)
Head.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
run = Head.add_run(title)
run.font.name = u'Cambria'
run.font.color.rgb = RGBColor(0, 0, 0)
run._element.rPr.rFonts.set(qn('w:eastAsia'), u'Cambria')
doc.add_paragraph(
f"\n生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
del bodies['标题']
if '综述' in bodies:
doc.add_paragraph(f"\t{bodies['综述']}\n")
del bodies['综述']
# 逐段添加章节
for key, value in bodies.items():
Head = doc.add_heading(level=2)
run = Head.add_run(key)
run.font.name = u'Cambria'
run.font.color.rgb = RGBColor(0, 0, 0)
doc.add_paragraph(f"{value}\n")
# 添加附件引用信息源
Head = doc.add_heading(level=2)
run = Head.add_run("附:原始信息网页")
run.font.name = u'Cambria'
run.font.color.rgb = RGBColor(0, 0, 0)
contents = []
for i, article in enumerate(articles):
date_text = str(article['publish_time'])
if len(date_text) == 8:
date_text = f"{date_text[:4]}-{date_text[4:6]}-{date_text[6:]}"
contents.append(f"{i+1}、{article['title']}|{date_text}\n{article['url']} ")
doc.add_paragraph("\n\n".join(contents))
doc.save(docx_file)
return True, result[result.find("【"):]