Document Loaders

https://python.langchain.com/v0.2/docs/integrations/document_loaders/


%pip install --user -Uq langchain langchain_community pypdf pdf2image docx2txt pdfminer

WebBaseLoader

https://python.langchain.com/v0.2/docs/integrations/document_loaders/web_base/

%pip install --user -Uq beautifulsoup4

from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://www.thisisgame.com/webzine/news/nboard/4/?n=189952")

data = loader.load()

print(data[0].page_content)
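A Document pairs page_content with metadata. A quick look at what WebBaseLoader fills in (typically source, title, description, and language, depending on the page):

# Inspect the metadata scraped alongside the text
print(data[0].metadata)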
loader = WebBaseLoader(["https://www.espn.com/", "https://google.com"])
docs = loader.load()
docs

Load multiple URLs concurrently

%pip install --user -Uq nest_asyncio

# fixes a bug with asyncio and jupyter
import nest_asyncio

nest_asyncio.apply()
loader = WebBaseLoader(["https://www.espn.com/", "https://google.com"])
loader.requests_per_second = 1  # cap the request rate
docs = loader.aload()  # fetch all pages concurrently
docs

XML parser
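WebBaseLoader parses responses with BeautifulSoup, and setting default_parser to "xml" switches it to BeautifulSoup's XML mode, which requires lxml (install it if it is not already present):

%pip install --user -Uq lxml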

loader = WebBaseLoader(
    "https://www.govinfo.gov/content/pkg/CFR-2018-title10-vol3/xml/CFR-2018-title10-vol3-sec431-86.xml"
)
loader.default_parser = "xml"
docs = loader.load()
docs

SitemapLoader

%pip install --user -Uq nest_asyncio

# fixes a bug with asyncio and jupyter
import nest_asyncio

nest_asyncio.apply()

from langchain_community.document_loaders.sitemap import SitemapLoader

sitemap_loader = SitemapLoader(
    web_path="https://api.python.langchain.com/sitemap.xml"
)
docs = sitemap_loader.load()
docs[0]
# Filtering sitemap URLs
loader = SitemapLoader(
    web_path="https://api.python.langchain.com/sitemap.xml",
    filter_urls=["https://api.python.langchain.com/en/latest"],
)
documents = loader.load()
documents[0]
sitemap_loader = SitemapLoader(
    web_path="https://beta.thisisgame.com/sitemap.xml",
    filter_urls=[
        "https://beta.thisisgame.com/articles/265823"
    ],
)

# Limit the crawl rate to be polite to the server
sitemap_loader.requests_per_second = 2

docs = sitemap_loader.load()
docs


Add custom scraping rules

%pip install --user -Uq beautifulsoup4
from bs4 import BeautifulSoup


def remove_nav_and_header_elements(content: BeautifulSoup) -> str:
    # Find all 'nav' and 'header' elements in the BeautifulSoup object
    nav_elements = content.find_all("nav")
    header_elements = content.find_all("header")

    # Remove each 'nav' and 'header' element from the BeautifulSoup object
    for element in nav_elements + header_elements:
        element.decompose()

    return str(content.get_text())
loader = SitemapLoader(
    "https://api.python.langchain.com/sitemap.xml",
    filter_urls=["https://api.python.langchain.com/en/latest/"],
    parsing_function=remove_nav_and_header_elements,
)
docs = loader.load()
docs

from bs4 import BeautifulSoup


def parse_page(soup):
    # Drop the site-wide header and footer before extracting text
    header = soup.find("header")
    footer = soup.find("footer")
    if header:
        header.decompose()
    if footer:
        footer.decompose()
    return (
        str(soup.get_text())
        .replace("\n", " ")
        .replace("\xa0", " ")
        # Strip the thisisgame.com navigation menu text
        .replace("진행게임계 화제인게임 이슈오피니언기획/특집연재/카툰갤러리커뮤니티 로그인로그인", "")
    )
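parse_page is not wired up above; a minimal sketch reusing the earlier thisisgame.com sitemap example, passing it as the parsing_function:

sitemap_loader = SitemapLoader(
    web_path="https://beta.thisisgame.com/sitemap.xml",
    filter_urls=["https://beta.thisisgame.com/articles/265823"],
    parsing_function=parse_page,
)
docs = sitemap_loader.load()
docs[0].page_content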

PDF Document Loader

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("data/sample.pdf")
pages = loader.load_and_split()  # load pages, splitting long ones into chunks

print(pages[0].page_content)
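Each resulting Document keeps the source file and zero-based page number in its metadata, which is handy for citations. A quick check:

# The source path and page number travel with every chunk
print(len(pages), pages[0].metadata)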

MS Word Document Loader

https://python.langchain.com/v0.2/docs/integrations/document_loaders/microsoft_word/

API Reference: Docx2txtLoader (https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.word_document.Docx2txtLoader.html)

%pip install --user -Uq docx2txt

from langchain_community.document_loaders import Docx2txtLoader

loader = Docx2txtLoader("data/family.docx")
data = loader.load()
data
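Docx2txtLoader extracts the entire file into a single Document. A quick sanity check (a sketch, assuming the file loaded above):

print(len(data))         # one Document covering the whole .docx
print(data[0].metadata)  # e.g. {'source': 'data/family.docx'}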
