Skip to content

Commit

Permalink
rename to full (#1304)
Browse files: browse the repository at this point in the history
* rename to `full`

* add html parser
  • Loading branch information
emrgnt-cmplxty authored Oct 2, 2024
1 parent 91e3147 commit 4b020ef
Show file tree
Hide file tree
Showing 6 changed files with 11 additions and 5 deletions.
File renamed without changes.
File renamed without changes.
4 changes: 3 additions & 1 deletion py/core/parsers/text/html_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
class HTMLParser(AsyncParser[DataType]):
"""A parser for HTML data."""

async def ingest(self, data: DataType) -> AsyncGenerator[str, None]:
async def ingest(
self, data: DataType, *args, **kwargs
) -> AsyncGenerator[str, None]:
"""Ingest HTML data and yield text."""
soup = BeautifulSoup(data, "html.parser")
yield soup.get_text()
2 changes: 2 additions & 0 deletions py/core/providers/ingestion/unstructured/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ class UnstructuredIngestionProvider(IngestionProvider):
DocumentType.SVG: [parsers.ImageParser],
DocumentType.MP3: [parsers.AudioParser],
DocumentType.JSON: [parsers.JSONParser],
DocumentType.HTML: [parsers.HTMLParser],
}

IMAGE_TYPES = {
Expand Down Expand Up @@ -226,6 +227,7 @@ async def parse(
json={
"file_content": encoded_content, # Use encoded string
"ingestion_config": ingestion_config,
"filename": document.metadata.get("title", None),
},
timeout=3600, # Adjust timeout as needed
)
Expand Down
2 changes: 1 addition & 1 deletion py/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "poetry.core.masonry.api"
[tool.poetry]
name = "r2r"
readme = "README.md"
version = "3.1.53"
version = "3.2.00"

description = "SciPhi R2R"
authors = ["Owen Colegrove <owen@sciphi.ai>"]
Expand Down
8 changes: 5 additions & 3 deletions services/unstructured/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import logging
import os
from io import BytesIO
from typing import Dict, List
from typing import Dict, List, Optional

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
Expand All @@ -18,6 +18,7 @@
class PartitionRequestModel(BaseModel):
file_content: bytes
ingestion_config: Dict
filename: Optional[str] = None


class PartitionResponseModel(BaseModel):
Expand All @@ -29,10 +30,10 @@ class PartitionResponseModel(BaseModel):
)


def run_partition(file_content: str, ingestion_config: Dict) -> List[Dict]:
def run_partition(file_content: str, filename: str, ingestion_config: Dict) -> List[Dict]:
file_content_bytes = base64.b64decode(file_content)
file_io = BytesIO(file_content_bytes)
elements = partition(file=file_io, **ingestion_config)
elements = partition(file=file_io, file_filename=filename, **ingestion_config)
return [element.to_dict() for element in elements]


Expand All @@ -50,6 +51,7 @@ async def partition_endpoint(request: PartitionRequestModel):
executor,
run_partition,
request.file_content,
request.filename,
request.ingestion_config,
)
logger.info(f"Partitioning completed")
Expand Down

0 comments on commit 4b020ef

Please sign in to comment.