fixed: Now Chunk Splitter: V3: Fine Deduplication is working fine again (#866).

This commit is contained in:
vorotamoroz
2026-05-19 11:26:35 +01:00
parent d6bf453a6d
commit ee30f6cd6d
4 changed files with 210 additions and 2 deletions

4
.gitignore vendored
View File

@@ -28,4 +28,6 @@ data.json
cov_profile/** cov_profile/**
coverage coverage
src/apps/cli/dist/* src/apps/cli/dist/*
_testdata/**
utils/bench/splitResults.csv

Submodule src/lib updated: a0af792b48...6abcea69eb

View File

@@ -3,6 +3,15 @@ Since 19th July, 2025 (beta1 in 0.25.0-beta1, 13th July, 2025)
The head note of 0.25 is now in [updates_old.md](https://github.com/vrtmrz/obsidian-livesync/blob/main/updates_old.md). Because 0.25 got a lot of updates, thankfully, compatibility is kept and we do not need breaking changes! In other words, when get enough stabled. The next version will be v1.0.0. Even though it my hope. The head note of 0.25 is now in [updates_old.md](https://github.com/vrtmrz/obsidian-livesync/blob/main/updates_old.md). Because 0.25 got a lot of updates, thankfully, compatibility is kept and we do not need breaking changes! In other words, when get enough stabled. The next version will be v1.0.0. Even though it my hope.
## Unreleased
19th May, 2026
### Fixed
- Now Chunk Splitter: `V3: Fine Deduplication` is working fine again (#866).
- It has some drawbacks, such as fewer chunks are generated. However, it makes less transfer and storage when the files are modified but not completely changed.
## 0.25.64 ## 0.25.64
17th May, 2026 17th May, 2026

View File

@@ -0,0 +1,197 @@
import { glob } from "glob";
import { resolve } from "node:path";
import { fileURLToPath } from "node:url";
import { promises as fs } from "node:fs";
import { isPlainText, shouldSplitAsPlainText } from "../../src/lib/src/string_and_binary/path";
import { splitPiecesRabinKarp } from "../../src/lib/src/string_and_binary/chunks";
import {
PREFERRED_BASE,
PREFERRED_JOURNAL_SYNC,
PREFERRED_SETTING_CLOUDANT,
PREFERRED_SETTING_SELF_HOSTED,
} from "../../src/lib/src/common/models/setting.const.preferred";
import { type ObsidianLiveSyncSettings, DEFAULT_SETTINGS, MAX_DOC_SIZE_BIN } from "../../src/lib/src/common/types";
async function blobFromString(content: string): Promise<Blob> {
return new Blob([content], { type: "text/plain" });
}
const preferred = PREFERRED_BASE;
const preferredJournal = PREFERRED_JOURNAL_SYNC;
const preferredCouchDB = PREFERRED_SETTING_SELF_HOSTED;
const preferredIBM = PREFERRED_SETTING_CLOUDANT;
function computeChunkSize(overlay: Partial<ObsidianLiveSyncSettings>) {
const settings = { ...DEFAULT_SETTINGS, ...overlay };
const maxChunkSize = Math.floor(MAX_DOC_SIZE_BIN * ((settings.customChunkSize || 0) * 1 + 1));
const pieceSize = maxChunkSize;
const minimumChunkSize = settings.minimumChunkSize;
return { pieceSize, minimumChunkSize };
}
async function testSplit(
splitPiecesRabinKarpFn: typeof splitPiecesRabinKarp,
content: Blob,
settingsOverlay: Partial<ObsidianLiveSyncSettings>
) {
const { pieceSize, minimumChunkSize } = computeChunkSize(settingsOverlay);
const isPlain = content.type === "text/plain";
const chunkGenerator = await splitPiecesRabinKarpFn(content, pieceSize, isPlain, minimumChunkSize);
const chunks = [] as string[];
for await (const chunk of chunkGenerator()) {
chunks.push(chunk);
}
// if there are few chunks, calculate average chunk size except the last chunk which can be smaller due to the way the algorithm works, especially for small files.
const averageChunkSize =
chunks.length > 1
? chunks.slice(0, -1).reduce((acc, chunk) => acc + chunk.length, 0) / (chunks.length - 1)
: chunks.reduce((acc, chunk) => acc + chunk.length, 0) / chunks.length;
const lastChunk = chunks[chunks.length - 1];
// compute minimum chunk size if the last chunk is not the smallest.
const nonLastChunkSizes = chunks.slice(0, -1).map((c) => c.length);
const minChunkSize = nonLastChunkSizes.length > 0 ? Math.min(...nonLastChunkSizes) : lastChunk.length;
const result = {
isPlain,
originalSize: content.size,
chunkCount: chunks.length,
totalLength: chunks.reduce((acc, chunk) => acc + chunk.length, 0),
averageChunkSize: averageChunkSize,
maxChunkSize: Math.max(...chunks.map((c) => c.length)),
minChunkSize: minChunkSize,
uniqueChunks: new Set(chunks).size,
chunks: chunks,
};
return result;
}
const __filename = fileURLToPath(import.meta.url);
const __dirname = resolve(__filename, "..");
async function loadFileAsBlob(filePath: string): Promise<Blob> {
if (shouldSplitAsPlainText(filePath)) {
const content = await fs.readFile(filePath, "utf-8");
return blobFromString(content);
} else {
const buffer = await fs.readFile(filePath);
return new Blob([buffer]);
}
}
const testProfiles = [
{ name: "CouchDB", settings: preferredCouchDB },
{ name: "IBM Cloudant", settings: preferredIBM },
{ name: "Journal Sync", settings: preferredJournal },
// { name: "Base", settings: preferred },
];
function modifyBlob(blob: Blob, position: number, insertText: string): Blob {
const before = blob.slice(0, position);
const after = blob.slice(position);
const insert = new Blob([insertText], { type: blob.type });
return new Blob([before, insert, after], { type: blob.type });
}
async function main() {
const results = [] as string[][];
console.log("directory:", __dirname);
const findPath = resolve(__dirname, "../../");
console.warn("CWD:", findPath);
let testFiles = await glob("**/*.*", {
cwd: findPath,
maxDepth: 20,
ignore: ["**/node_modules/**", "**/.obsidian/**", "**/dist/**", "**/build/**", "**/out/**"],
});
testFiles = testFiles.filter((file) => {
const ext = file.split(".").pop()?.toLowerCase() || "";
return ["md", "txt", "json", "csv", "png"].includes(ext);
});
const header = [
"Profile",
"Implementation",
"Edition",
"File",
"Mode",
"Original Size (bytes)",
"Chunk Count",
"Average Chunk Size",
"Max Chunk Size",
"Min Chunk Size",
"Unique Chunks",
"Shared Chunks",
"Savings",
"Newly added (count)",
"Newly consumed (bytes)",
];
for (const profile of testProfiles) {
console.log(`Testing profile: ${profile.name}`);
for (const fn of [splitPiecesRabinKarp]) {
const funcProfile = fn !== splitPiecesRabinKarp ? "Old" : "New";
console.log(`Testing function: ${funcProfile}`);
for (const file of testFiles) {
const filePath = resolve(findPath, file);
const isPlain = shouldSplitAsPlainText(filePath);
const content = await loadFileAsBlob(filePath);
console.log(`Testing file: ${file} (size: ${content.size} bytes)`);
const result = await testSplit(fn, content, profile.settings);
const chunkSizes = result.chunks.map((c) => c.length);
const savings = result.originalSize - chunkSizes.reduce((acc, size) => acc + size, 0);
// console.log(`Result for ${file}:`, result);
results.push([
`${profile.name}`,
funcProfile,
"original",
file,
isPlain ? "plain" : "binary",
content.size.toString(),
result.chunkCount.toString(),
result.averageChunkSize.toFixed(2),
result.maxChunkSize.toString(),
result.minChunkSize.toString(),
result.uniqueChunks.toString(),
"",
savings.toString(),
"",
"",
]);
// add editions (inserting "*") to content on head, 5%, middle, 95%, tail to see if it affects the chunking
const editions = [
{ name: "head", content: modifyBlob(content, 0, "*") },
{ name: "5%", content: modifyBlob(content, Math.floor(content.size * 0.05), "*") },
{ name: "middle", content: modifyBlob(content, Math.floor(content.size * 0.5), "*") },
{ name: "95%", content: modifyBlob(content, Math.floor(content.size * 0.95), "*") },
{ name: "tail", content: modifyBlob(content, content.size, "*") },
];
const baseChunks = result.chunks;
for (const edition of editions) {
console.log(`Testing edition: ${edition.name}`);
const editionResult = await testSplit(fn, edition.content, profile.settings);
const sharedChunks = editionResult.chunks.filter((chunk) => baseChunks.includes(chunk)).length;
const newChunks = editionResult.chunks.filter((chunk) => !baseChunks.includes(chunk));
const editionResultChunkLength = editionResult.chunks.map((c) => c.length);
// console.log(`Result for edition ${edition.name} of ${file}:`, editionResult);
const editionSavings =
editionResult.originalSize - editionResultChunkLength.reduce((acc, size) => acc + size, 0);
// newly added chunks size :
const newChunksSize = newChunks.reduce((acc, chunk) => acc + chunk.length, 0);
results.push([
`${profile.name}`,
funcProfile,
`${edition.name}`,
file,
isPlain ? "plain" : "binary",
edition.content.size.toString(),
editionResult.chunkCount.toString(),
editionResult.averageChunkSize.toFixed(2),
editionResult.maxChunkSize.toString(),
editionResult.minChunkSize.toString(),
editionResult.uniqueChunks.toString(),
sharedChunks.toString(),
editionSavings.toString(),
newChunks.length.toString(),
newChunksSize.toString(),
]);
}
}
}
}
results.unshift(header);
await fs.writeFile(resolve(__dirname, "splitResults.csv"), results.map((r) => r.join(",")).join("\n"));
}
main();