Ritvik19 committed on
Commit
7793370
·
verified ·
1 Parent(s): 6ae5e8b

Upload process_documents.py

Browse files
Files changed (1) hide show
  1. process_documents.py +4 -1
process_documents.py CHANGED
@@ -10,12 +10,15 @@ deep_strip = lambda text: re.sub(r"\s+", " ", text or "").strip()
10
 
11
  def process_documents(urls):
12
  snippets = []
 
13
  for source_id, url in enumerate(urls):
14
  if url.endswith(".pdf"):
15
  snippets.extend(process_pdf(url, source_id))
 
16
  else:
17
  snippets.extend(process_web(url, source_id))
18
- return snippets
 
19
 
20
 
21
  def process_web(url, source_id):
 
10
 
11
  def process_documents(urls):
12
  snippets = []
13
+ documents = []
14
  for source_id, url in enumerate(urls):
15
  if url.endswith(".pdf"):
16
  snippets.extend(process_pdf(url, source_id))
17
+ documents.append("\n".join([snip.page_content for snip in snippets]))
18
  else:
19
  snippets.extend(process_web(url, source_id))
20
+ documents.append("\n".join([snip.page_content for snip in snippets]))
21
+ return snippets, documents
22
 
23
 
24
  def process_web(url, source_id):