Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve Obsidian Sync for Large Vaults #1078

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 54 additions & 13 deletions src/interface/obsidian/src/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,11 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las
lastSync = lastSync.size > 0 ? lastSync : new Map<TFile, number>();

// Add all files to index as multipart form data
const fileData = [];
let fileData = [];
let currentBatchSize = 0;
const MAX_BATCH_SIZE = 10 * 1024 * 1024; // 10MB max batch size
let currentBatch = [];

for (const file of files) {
// Only push files that have been modified since last sync if not regenerating
if (!regenerate && file.stat.mtime < (lastSync.get(file) ?? 0)) {
Expand All @@ -98,31 +102,68 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las
const encoding = supportedBinaryFileTypes.includes(file.extension) ? "binary" : "utf8";
const mimeType = fileExtensionToMimeType(file.extension) + (encoding === "utf8" ? "; charset=UTF-8" : "");
const fileContent = encoding == 'binary' ? await vault.readBinary(file) : await vault.read(file);
fileData.push({ blob: new Blob([fileContent], { type: mimeType }), path: file.path });
const fileItem = { blob: new Blob([fileContent], { type: mimeType }), path: file.path };

// Check if adding this file would exceed batch size
const fileSize = (typeof fileContent === 'string') ? new Blob([fileContent]).size : fileContent.byteLength;
if (currentBatchSize + fileSize > MAX_BATCH_SIZE && currentBatch.length > 0) {
fileData.push(currentBatch);
currentBatch = [];
currentBatchSize = 0;
}

currentBatch.push(fileItem);
currentBatchSize += fileSize;
}

// Add any previously synced files to be deleted to multipart form data
// Add any previously synced files to be deleted to final batch
let filesToDelete: TFile[] = [];
for (const lastSyncedFile of lastSync.keys()) {
if (!files.includes(lastSyncedFile)) {
countOfFilesToDelete++;
let fileObj = new Blob([""], { type: filenameToMimeType(lastSyncedFile) });
fileData.push({ blob: fileObj, path: lastSyncedFile.path });
currentBatch.push({ blob: fileObj, path: lastSyncedFile.path });
filesToDelete.push(lastSyncedFile);
}
}

// Iterate through all indexable files in vault, 1000 at a time
let responses: string[] = [];
// Add final batch if not empty
if (currentBatch.length > 0) {
fileData.push(currentBatch);
}

// Delete all files of enabled content types first if regenerating
let error_message = null;
for (let i = 0; i < fileData.length; i += 1000) {
const filesGroup = fileData.slice(i, i + 1000);
const contentTypesToDelete = [];
if (regenerate) {
// Mark content types to delete based on user sync file type settings
if (setting.syncFileType.markdown) contentTypesToDelete.push('markdown');
if (setting.syncFileType.pdf) contentTypesToDelete.push('pdf');
if (setting.syncFileType.images) contentTypesToDelete.push('image');
}
for (const contentType of contentTypesToDelete) {
const response = await fetch(`${setting.khojUrl}/api/content/${contentType}?client=obsidian`, {
method: "DELETE",
headers: {
'Authorization': `Bearer ${setting.khojApiKey}`,
}
});
if (!response.ok) {
error_message = "❗️Failed to clear existing content index";
fileData = [];
}
}

// Iterate through all indexable files in vault, 10Mb batch at a time
let responses: string[] = [];
for (const batch of fileData) {
// Create multipart form data with all files in batch
const formData = new FormData();
const method = regenerate ? "PUT" : "PATCH";
filesGroup.forEach(fileItem => { formData.append('files', fileItem.blob, fileItem.path) });
// Call Khoj backend to update index with all markdown, pdf files
batch.forEach(fileItem => { formData.append('files', fileItem.blob, fileItem.path) });

// Call Khoj backend to sync index with updated files in vault
const response = await fetch(`${setting.khojUrl}/api/content?client=obsidian`, {
method: method,
method: "PATCH",
headers: {
'Authorization': `Bearer ${setting.khojApiKey}`,
},
Expand Down Expand Up @@ -167,7 +208,7 @@ export async function updateContentIndex(vault: Vault, setting: KhojSetting, las
error_message = `❗️Could not connect to Khoj server. Ensure you can connect to it.`;
break;
} else {
error_message = `❗️Failed to sync your content with Khoj server. Raise issue on Khoj Discord or Github\nError: ${response.statusText}`;
error_message = `❗️Failed to sync all your content with Khoj server. Raise issue on Khoj Discord or Github\nError: ${response.statusText}`;
}
} else {
responses.push(await response.text());
Expand Down
23 changes: 23 additions & 0 deletions src/khoj/routers/api_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,29 @@ def get_content_types(request: Request, client: Optional[str] = None):
return list(configured_content_types & all_content_types)


@api_content.delete("/{content_type}", status_code=200)
Copy link
Member

@sabaimran sabaimran Jan 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This API spec conflicts with our other delete endpoint, as below:

@api_content.get("/{content_source}", response_model=List[str])

I'd suggest we just use the existing deletion endpoint, and send it with type computer rather than specifying file types. Ideally, we should differentiate with Obsidian here, but that might be a bigger discussion.

We could also extend the existing API to take in content_type and content_source as query parameters, rather than path parameters.

@requires(["authenticated"])
async def delete_content_type(
    request: Request,
    content_type: SearchType,
    client: Optional[str] = None,
):
    """Delete every indexed entry of the given content type for the requesting user.

    Args:
        request: Incoming request; the user is read from ``request.user.object``.
        content_type: Content type whose entries are removed. It is passed to
            ``EntryAdapters.delete_all_entries`` as ``file_type``.
        client: Optional client identifier, forwarded to telemetry only.

    Returns:
        ``{"status": "ok"}`` once the deletion call and telemetry update have run.
    """
    user = request.user.object

    # The adapter call is synchronous, so run it through sync_to_async
    # rather than calling it directly from this async handler.
    await sync_to_async(EntryAdapters.delete_all_entries)(user, file_type=content_type)

    update_telemetry_state(
        request=request,
        telemetry_type="api",
        api="delete_content_config",
        client=client,
        metadata={"content_type": content_type},
    )

    # NOTE(review): the original also fetched the user's remaining file types
    # into an unused local (`enabled_content`). That lookup is dropped here
    # because its result never reached the response — confirm
    # `EntryAdapters.get_unique_file_types` has no side effects callers rely on.
    return {"status": "ok"}


@api_content.get("/files", response_model=Dict[str, str])
@requires(["authenticated"])
async def get_all_files(
Expand Down
Loading