30 lines
742 B
Python
30 lines
742 B
Python
from langchain_community.document_loaders import PyMuPDFLoader
|
|
import docx
|
|
from pptx import Presentation
|
|
|
|
def extract_text(path):
|
|
return open(path, 'r').read()
|
|
|
|
def extract_pdf(path):
|
|
loader = PyMuPDFLoader(path)
|
|
data = loader.load()
|
|
data = [x.page_content for x in data]
|
|
content = '\n\n'.join(data)
|
|
return content
|
|
|
|
def extract_docx(path):
|
|
doc = docx.Document(path)
|
|
data = []
|
|
for paragraph in doc.paragraphs:
|
|
data.append(paragraph.text)
|
|
content = '\n\n'.join(data)
|
|
|
|
def extract_pptx(path):
|
|
prs = Presentation(path)
|
|
text = ""
|
|
for slide in prs.slides:
|
|
for shape in slide.shapes:
|
|
if hasattr(shape, "text"):
|
|
text += shape.text + "\n"
|
|
return text
|