Skip to content

Commit

Permalink
chore: code fix for pdf loader and tests
Browse files: browse the repository at this point in the history
  • Loading branch information
d1pankarmedhi committed May 23, 2024
1 parent ffb1310 commit e315b13
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 5 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,7 @@
/target

# resources
.resource

# files
main.rs
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ These are some chunking strategy examples:

- [Chunking by words](/examples/chunk_by_words.rs) - Chunk your documents/texts by number of words.
- [Chunking by characters](/examples/chunk_by_chars.rs) - Chunk your documents/texts by number of characters.
- [Chunking PDF documents](/examples/chunk_document.rs) - Chunk your PDF documents by words or characters.

Run them using the cargo command like:
```bash
Expand Down
4 changes: 2 additions & 2 deletions src/loader/pdf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ impl BaseLoader<Result<String>> for PDFLoader {
let pages = doc.get_pages();
for (i, _) in pages.iter().enumerate() {
let page_number = (i + 1) as u32;
let text = doc.extract_text(&[page_number]);
content.push_str(text.unwrap().as_str());
let text = doc.extract_text(&[page_number]).unwrap();
content.push_str(text.as_str());
}
Ok(content)
}
Expand Down
6 changes: 3 additions & 3 deletions tests/chunker_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ fn test_character_chunker() {
let input_text = loader
.load_from_file("tests/test_files/sample_doc.pdf")
.unwrap();
let chunk_size = 3000;
let chunk_size = 1000;
let overlap = 50;
let chunks = char_chunker
.chunk_text(&input_text, chunk_size, overlap)
.unwrap();
dbg!(chunks.len());
assert_eq!(58, chunks.len());
assert_eq!(5, chunks.len());
}

#[test]
Expand All @@ -35,5 +35,5 @@ fn test_word_chunker() {
.chunk_text(&input_text, chunk_size, overlap)
.unwrap();
dbg!(&chunks);
assert_eq!(11060, chunks[0].content.len());
assert_eq!(1627, chunks[0].content.len());
}
Binary file modified tests/test_files/sample_doc.pdf
Binary file not shown.

0 comments on commit e315b13

Please sign in to comment.