| """Base schema for readers.""" |
| from dataclasses import dataclass |
|
|
| from langchain.docstore.document import Document as LCDocument |
| from application.parser.schema.schema import BaseDocument |
|
|
|
|
| @dataclass |
| class Document(BaseDocument): |
| """Generic interface for a data document. |
| |
| This document connects to data sources. |
| |
| """ |
|
|
| def __post_init__(self) -> None: |
| """Post init.""" |
| if self.text is None: |
| raise ValueError("text field not set.") |
|
|
| @classmethod |
| def get_type(cls) -> str: |
| """Get Document type.""" |
| return "Document" |
|
|
| def to_langchain_format(self) -> LCDocument: |
| """Convert struct to LangChain document format.""" |
| metadata = self.extra_info or {} |
| return LCDocument(page_content=self.text, metadata=metadata) |
|
|
| @classmethod |
| def from_langchain_format(cls, doc: LCDocument) -> "Document": |
| """Convert struct from LangChain document format.""" |
| return cls(text=doc.page_content, extra_info=doc.metadata) |
|
|