Document Loaders
https://python.langchain.com/v0.2/docs/integrations/document_loaders/
%pip install --user -Uq langchain langchain_community pypdf pdf2image docx2txt pdfminer.six
WebBaseLoader
https://python.langchain.com/v0.2/docs/integrations/document_loaders/web_base/
%pip install --user -Uq beautifulsoup4
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader("https://www.thisisgame.com/webzine/news/nboard/4/?n=189952")
data = loader.load()
print(data[0].page_content)
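WebBaseLoader forwards bs_kwargs straight to BeautifulSoup, so a SoupStrainer can restrict parsing to just the part of the page you care about. A minimal sketch; the class_ value here is a hypothetical placeholder, so inspect the target page for the real container name:
import bs4

# parse only elements matching the strainer; "article-body" is a made-up class name
loader = WebBaseLoader(
    "https://www.thisisgame.com/webzine/news/nboard/4/?n=189952",
    bs_kwargs=dict(parse_only=bs4.SoupStrainer(class_="article-body")),
)
data = loader.load()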
loader = WebBaseLoader(["https://www.espn.com/", "https://google.com"])
docs = loader.load()
docs
Load multiple URLs concurrently
%pip install --user -Uq nest_asyncio
# fixes a bug with asyncio and jupyter
import nest_asyncio
nest_asyncio.apply()
loader = WebBaseLoader(["https://www.espn.com/", "https://google.com"])
loader.requests_per_second = 1
docs = loader.aload()
docs
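requests_per_second throttles the concurrent fetches so the target hosts are not hammered. For long URL lists you can also stream documents one at a time instead of materializing the whole list; a small sketch using the loader's lazy_load:
# yields one Document at a time instead of building the full list in memory
for doc in loader.lazy_load():
    print(doc.metadata["source"])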
XML parser
loader = WebBaseLoader(
"https://www.govinfo.gov/content/pkg/CFR-2018-title10-vol3/xml/CFR-2018-title10-vol3-sec431-86.xml"
)
loader.default_parser = "xml"
docs = loader.load()
docs
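Note that BeautifulSoup's "xml" parser is backed by lxml, so lxml needs to be installed for this to work:
%pip install --user -Uq lxml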
SitemapLoader
%pip install --user -Uq nest_asyncio
# fixes a bug with asyncio and jupyter
import nest_asyncio
nest_asyncio.apply()
from langchain_community.document_loaders.sitemap import SitemapLoader
sitemap_loader = SitemapLoader(web_path="https://api.python.langchain.com/sitemap.xml")
docs = sitemap_loader.load()
# Filtering sitemap URLs
loader = SitemapLoader(
    web_path="https://api.python.langchain.com/sitemap.xml",
    filter_urls=["https://api.python.langchain.com/en/latest"],
)
documents = loader.load()
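The filter_urls entries are regular-expression patterns matched against each sitemap URL, not plain prefixes, so one pattern can select a whole family of pages. A sketch with a hypothetical pattern:
# keep only pages whose URL matches the pattern (made-up example)
loader = SitemapLoader(
    web_path="https://api.python.langchain.com/sitemap.xml",
    filter_urls=[r"https://api\.python\.langchain\.com/en/latest/.*loaders.*"],
)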
sitemap_loader = SitemapLoader(
    web_path="https://beta.thisisgame.com/sitemap.xml",
    filter_urls=["https://beta.thisisgame.com/articles/265823"],
)
sitemap_loader.requests_per_second = 2
docs = sitemap_loader.load()
docs
Add custom scraping rules
%pip install --user -Uq beautifulsoup4
from bs4 import BeautifulSoup

def remove_nav_and_header_elements(content: BeautifulSoup) -> str:
    # Find all 'nav' and 'header' elements in the BeautifulSoup object
    nav_elements = content.find_all("nav")
    header_elements = content.find_all("header")

    # Remove each 'nav' and 'header' element from the BeautifulSoup object
    for element in nav_elements + header_elements:
        element.decompose()

    return str(content.get_text())
loader = SitemapLoader(
    "https://api.python.langchain.com/sitemap.xml",
    filter_urls=["https://api.python.langchain.com/en/latest/"],
    parsing_function=remove_nav_and_header_elements,
)
docs = loader.load()
docs
from bs4 import BeautifulSoup

def parse_page(soup):
    # strip the page chrome before extracting the text
    header = soup.find("header")
    footer = soup.find("footer")
    if header:
        header.decompose()
    if footer:
        footer.decompose()
    return (
        str(soup.get_text())
        .replace("\n", " ")
        .replace("\xa0", " ")
        # the original chained one more .replace(..., "") here that stripped
        # the site's Korean navigation-menu text (menu labels ending in
        # "로그인로그인", i.e. "LoginLogin"); that literal was garbled in this
        # copy, so it is omitted rather than guessed at
    )
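parse_page plugs into SitemapLoader the same way as the earlier cleaner, via parsing_function:
sitemap_loader = SitemapLoader(
    web_path="https://beta.thisisgame.com/sitemap.xml",
    filter_urls=["https://beta.thisisgame.com/articles/265823"],
    parsing_function=parse_page,
)
docs = sitemap_loader.load()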
PDF Document Loader
Copy from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader("data/sample.pdf")
pages= loader.load_and_split()
Copy
print(pages[0].page_content)
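PyPDFLoader loads one Document per page, and load_and_split then runs a default RecursiveCharacterTextSplitter over them; you can pass your own splitter instead. A minimal sketch (the chunk sizes here are arbitrary):
from langchain_text_splitters import RecursiveCharacterTextSplitter

# split pages into ~1000-character chunks with 100 characters of overlap
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
pages = loader.load_and_split(text_splitter=splitter)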
MS Word Document Loader
https://python.langchain.com/v0.2/docs/integrations/document_loaders/microsoft_word/
API Reference: Docx2txtLoader (https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.word_document.Docx2txtLoader.html)
%pip install --user -Uq docx2txt
from langchain_community.document_loaders import Docx2txtLoader
loader = Docx2txtLoader("data/family.docx")
data = loader.load()
data
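Docx2txtLoader returns the whole file as a single Document whose metadata records the source path, which is handy when several files are loaded together:
print(data[0].metadata)            # {'source': 'data/family.docx'}
print(data[0].page_content[:200])  # first 200 characters of the extracted text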