glm4/composite_demo/src/utils.py

30 lines
742 B
Python

from langchain_community.document_loaders import PyMuPDFLoader
import docx
from pptx import Presentation
def extract_text(path):
    """Return the full contents of the text file at *path* as a string.

    The file is opened in text mode with the platform's default encoding
    and is closed before returning, even if reading raises.

    Args:
        path: Location of the file to read.

    Returns:
        The complete file contents.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are not valid in the default encoding.
    """
    # Use a context manager so the handle is released deterministically
    # instead of waiting for garbage collection.
    with open(path, 'r') as handle:
        return handle.read()
def extract_pdf(path):
    """Return the text of the PDF at *path* as a single string.

    The file is loaded with ``PyMuPDFLoader``; the ``page_content`` of each
    loaded document (presumably one per page -- confirm against the loader)
    is joined with a blank line between entries.
    """
    documents = PyMuPDFLoader(path).load()
    return '\n\n'.join([doc.page_content for doc in documents])
def extract_docx(path):
    """Return the paragraph text of the Word document at *path*.

    The document is opened with ``docx.Document`` and the text of every
    entry in ``doc.paragraphs`` is joined with a blank line between
    paragraphs.

    Args:
        path: Location of the ``.docx`` file.

    Returns:
        The joined paragraph text (an empty string if there are no
        paragraphs).
    """
    doc = docx.Document(path)
    paragraphs = [paragraph.text for paragraph in doc.paragraphs]
    # The joined text must be returned; without this the function would
    # yield None, unlike the other extract_* helpers.
    return '\n\n'.join(paragraphs)
def extract_pptx(path):
    """Return the text of the PowerPoint presentation at *path*.

    Walks every shape on every slide in order and, for each shape that
    exposes a ``text`` attribute, emits that text followed by a newline.
    """
    presentation = Presentation(path)
    return "".join(
        shape.text + "\n"
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )