From ee30f6cd6df8d4e2510d45c9925d1ddfc4fd59ce Mon Sep 17 00:00:00 2001 From: vorotamoroz Date: Tue, 19 May 2026 11:26:35 +0100 Subject: [PATCH] fixed: Now Chunk Splitter: `V3: Fine Deduplication` is working fine again (#866). --- .gitignore | 4 +- src/lib | 2 +- updates.md | 9 ++ utils/bench/splitPiecesRabinKarp.ts | 197 ++++++++++++++++++++++++++++ 4 files changed, 210 insertions(+), 2 deletions(-) create mode 100644 utils/bench/splitPiecesRabinKarp.ts diff --git a/.gitignore b/.gitignore index 43e267e..a6bced9 100644 --- a/.gitignore +++ b/.gitignore @@ -28,4 +28,6 @@ data.json cov_profile/** coverage -src/apps/cli/dist/* \ No newline at end of file +src/apps/cli/dist/* +_testdata/** +utils/bench/splitResults.csv \ No newline at end of file diff --git a/src/lib b/src/lib index a0af792..6abcea6 160000 --- a/src/lib +++ b/src/lib @@ -1 +1 @@ -Subproject commit a0af792b48e6e7a5b14d7ee932b81796b65bd497 +Subproject commit 6abcea69eb929ea261308b543ac42cd54a00eee2 diff --git a/updates.md b/updates.md index 8a4e9e6..bc34fb3 100644 --- a/updates.md +++ b/updates.md @@ -3,6 +3,15 @@ Since 19th July, 2025 (beta1 in 0.25.0-beta1, 13th July, 2025) The head note of 0.25 is now in [updates_old.md](https://github.com/vrtmrz/obsidian-livesync/blob/main/updates_old.md). Because 0.25 got a lot of updates, thankfully, compatibility is kept and we do not need breaking changes! In other words, when get enough stabled. The next version will be v1.0.0. Even though it my hope. +## Unreleased + +19th May, 2026 + +### Fixed +- The Chunk Splitter `V3: Fine Deduplication` is now working again (#866). + - It has some drawbacks, such as fewer chunks being generated. However, it reduces transfer and storage when files are modified but not completely changed. 
+
+
 ## 0.25.64
 
 17th May, 2026
diff --git a/utils/bench/splitPiecesRabinKarp.ts b/utils/bench/splitPiecesRabinKarp.ts
new file mode 100644
index 0000000..1c4642c
--- /dev/null
+++ b/utils/bench/splitPiecesRabinKarp.ts
@@ -0,0 +1,219 @@
+import { glob } from "glob";
+import { resolve } from "node:path";
+import { fileURLToPath } from "node:url";
+import { promises as fs } from "node:fs";
+import { isPlainText, shouldSplitAsPlainText } from "../../src/lib/src/string_and_binary/path";
+import { splitPiecesRabinKarp } from "../../src/lib/src/string_and_binary/chunks";
+import {
+    PREFERRED_BASE,
+    PREFERRED_JOURNAL_SYNC,
+    PREFERRED_SETTING_CLOUDANT,
+    PREFERRED_SETTING_SELF_HOSTED,
+} from "../../src/lib/src/common/models/setting.const.preferred";
+import { type ObsidianLiveSyncSettings, DEFAULT_SETTINGS, MAX_DOC_SIZE_BIN } from "../../src/lib/src/common/types";
+
+/** Wraps text in a Blob typed `text/plain`; `testSplit` selects plain-text mode from that type. */
+async function blobFromString(content: string): Promise<Blob> {
+    return new Blob([content], { type: "text/plain" });
+}
+
+const preferred = PREFERRED_BASE;
+const preferredJournal = PREFERRED_JOURNAL_SYNC;
+const preferredCouchDB = PREFERRED_SETTING_SELF_HOSTED;
+const preferredIBM = PREFERRED_SETTING_CLOUDANT;
+
+/** Derives the piece size and minimum chunk size from the default settings plus an overlay. */
+function computeChunkSize(overlay: Partial<ObsidianLiveSyncSettings>) {
+    const settings = { ...DEFAULT_SETTINGS, ...overlay };
+    const maxChunkSize = Math.floor(MAX_DOC_SIZE_BIN * ((settings.customChunkSize || 0) * 1 + 1));
+    const pieceSize = maxChunkSize;
+
+    const minimumChunkSize = settings.minimumChunkSize;
+    return { pieceSize, minimumChunkSize };
+}
+
+/** Quotes a CSV field when it contains a comma, a double quote, or a line break. */
+function toCsvField(value: string): string {
+    return /[",\r\n]/.test(value) ? `"${value.replace(/"/g, '""')}"` : value;
+}
+
+/**
+ * Splits `content` with the given splitter and summarises the resulting chunks.
+ * An input that yields no chunks (e.g. an empty file) reports zero for every size statistic.
+ */
+async function testSplit(
+    splitPiecesRabinKarpFn: typeof splitPiecesRabinKarp,
+    content: Blob,
+    settingsOverlay: Partial<ObsidianLiveSyncSettings>
+) {
+    const { pieceSize, minimumChunkSize } = computeChunkSize(settingsOverlay);
+    const isPlain = content.type === "text/plain";
+    const chunkGenerator = await splitPiecesRabinKarpFn(content, pieceSize, isPlain, minimumChunkSize);
+    const chunks: string[] = [];
+    for await (const chunk of chunkGenerator()) {
+        chunks.push(chunk);
+    }
+    const chunkLengths = chunks.map((c) => c.length);
+    // The last chunk is only the remainder, so it is left out of the average and the minimum
+    // unless it is the only chunk.
+    const measured = chunkLengths.length > 1 ? chunkLengths.slice(0, -1) : chunkLengths;
+    const measuredTotal = measured.reduce((acc, len) => acc + len, 0);
+    // Fold instead of spreading into Math.min/Math.max: no argument-count limit, no Infinity when empty.
+    return {
+        isPlain,
+        originalSize: content.size,
+        chunkCount: chunks.length,
+        totalLength: chunkLengths.reduce((acc, len) => acc + len, 0),
+        averageChunkSize: measured.length > 0 ? measuredTotal / measured.length : 0,
+        maxChunkSize: chunkLengths.reduce((acc, len) => Math.max(acc, len), 0),
+        minChunkSize: measured.length > 0 ? measured.reduce((acc, len) => Math.min(acc, len)) : 0,
+        uniqueChunks: new Set(chunks).size,
+        chunks: chunks,
+    };
+}
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = resolve(__filename, "..");
+
+/** Reads a file into a Blob: UTF-8 text when `shouldSplitAsPlainText` accepts the path, raw bytes otherwise. */
+async function loadFileAsBlob(filePath: string): Promise<Blob> {
+    if (shouldSplitAsPlainText(filePath)) {
+        const content = await fs.readFile(filePath, "utf-8");
+        return blobFromString(content);
+    } else {
+        const buffer = await fs.readFile(filePath);
+        return new Blob([buffer]);
+    }
+}
+const testProfiles = [
+    { name: "CouchDB", settings: preferredCouchDB },
+    { name: "IBM Cloudant", settings: preferredIBM },
+    { name: "Journal Sync", settings: preferredJournal },
+    // { name: "Base", settings: preferred },
+];
+
+/**
+ * Returns a copy of `blob` with `insertText` inserted at byte offset `position`.
+ * NOTE: the offset counts bytes, so in text it can fall inside a multi-byte UTF-8 sequence.
+ */
+function modifyBlob(blob: Blob, position: number, insertText: string): Blob {
+    const before = blob.slice(0, position);
+    const after = blob.slice(position);
+    const insert = new Blob([insertText], { type: blob.type });
+    return new Blob([before, insert, after], { type: blob.type });
+}
+
+/** Benchmarks the splitter over the repository's files and writes `splitResults.csv` next to this script. */
+async function main() {
+    const results: string[][] = [];
+    console.log("directory:", __dirname);
+    const findPath = resolve(__dirname, "../../");
+    const outputPath = resolve(__dirname, "splitResults.csv");
+    console.warn("CWD:", findPath);
+    const foundFiles = await glob("**/*.*", {
+        cwd: findPath,
+        maxDepth: 20,
+        nodir: true,
+        ignore: ["**/node_modules/**", "**/.obsidian/**", "**/dist/**", "**/build/**", "**/out/**"],
+    });
+    const testFiles = foundFiles.filter((file) => {
+        // Never feed the report of a previous run back in as benchmark input.
+        if (resolve(findPath, file) === outputPath) return false;
+        const ext = file.split(".").pop()?.toLowerCase() || "";
+        return ["md", "txt", "json", "csv", "png"].includes(ext);
+    });
+    const header = [
+        "Profile",
+        "Implementation",
+        "Edition",
+        "File",
+        "Mode",
+        "Original Size (bytes)",
+        "Chunk Count",
+        "Average Chunk Size",
+        "Max Chunk Size",
+        "Min Chunk Size",
+        "Unique Chunks",
+        "Shared Chunks",
+        "Savings",
+        "Newly added (count)",
+        "Newly consumed (bytes)",
+    ];
+    for (const profile of testProfiles) {
+        console.log(`Testing profile: ${profile.name}`);
+        for (const fn of [splitPiecesRabinKarp]) {
+            const funcProfile = fn !== splitPiecesRabinKarp ? "Old" : "New";
+            console.log(`Testing function: ${funcProfile}`);
+            for (const file of testFiles) {
+                const filePath = resolve(findPath, file);
+                const isPlain = shouldSplitAsPlainText(filePath);
+                const content = await loadFileAsBlob(filePath);
+                console.log(`Testing file: ${file} (size: ${content.size} bytes)`);
+                const result = await testSplit(fn, content, profile.settings);
+                const savings = result.originalSize - result.totalLength;
+                // console.log(`Result for ${file}:`, result);
+                results.push([
+                    `${profile.name}`,
+                    funcProfile,
+                    "original",
+                    file,
+                    isPlain ? "plain" : "binary",
+                    content.size.toString(),
+                    result.chunkCount.toString(),
+                    result.averageChunkSize.toFixed(2),
+                    result.maxChunkSize.toString(),
+                    result.minChunkSize.toString(),
+                    result.uniqueChunks.toString(),
+                    "",
+                    savings.toString(),
+                    "",
+                    "",
+                ]);
+                // add editions (inserting "*") to content on head, 5%, middle, 95%, tail to see if it affects the chunking
+                const editions = [
+                    { name: "head", content: modifyBlob(content, 0, "*") },
+                    { name: "5%", content: modifyBlob(content, Math.floor(content.size * 0.05), "*") },
+                    { name: "middle", content: modifyBlob(content, Math.floor(content.size * 0.5), "*") },
+                    { name: "95%", content: modifyBlob(content, Math.floor(content.size * 0.95), "*") },
+                    { name: "tail", content: modifyBlob(content, content.size, "*") },
+                ];
+                // Set lookup keeps the shared/new classification linear in the number of chunks.
+                const baseChunks = new Set(result.chunks);
+                for (const edition of editions) {
+                    console.log(`Testing edition: ${edition.name}`);
+                    const editionResult = await testSplit(fn, edition.content, profile.settings);
+                    const sharedChunks = editionResult.chunks.filter((chunk) => baseChunks.has(chunk)).length;
+                    const newChunks = editionResult.chunks.filter((chunk) => !baseChunks.has(chunk));
+                    // console.log(`Result for edition ${edition.name} of ${file}:`, editionResult);
+                    const editionSavings = editionResult.originalSize - editionResult.totalLength;
+                    // newly added chunks size :
+                    const newChunksSize = newChunks.reduce((acc, chunk) => acc + chunk.length, 0);
+                    results.push([
+                        `${profile.name}`,
+                        funcProfile,
+                        `${edition.name}`,
+                        file,
+                        isPlain ? "plain" : "binary",
+                        edition.content.size.toString(),
+                        editionResult.chunkCount.toString(),
+                        editionResult.averageChunkSize.toFixed(2),
+                        editionResult.maxChunkSize.toString(),
+                        editionResult.minChunkSize.toString(),
+                        editionResult.uniqueChunks.toString(),
+                        sharedChunks.toString(),
+                        editionSavings.toString(),
+                        newChunks.length.toString(),
+                        newChunksSize.toString(),
+                    ]);
+                }
+            }
+        }
+    }
+
+    results.unshift(header);
+    const csv = results.map((row) => row.map(toCsvField).join(",")).join("\n");
+    await fs.writeFile(outputPath, csv);
+}
+main().catch((err: unknown) => {
+    console.error(err);
+    process.exitCode = 1;
+});