Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion lib/compress/zstd_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -2328,6 +2328,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
for (i = 0; i < ZSTD_REP_NUM; ++i)
zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i];
}
printf("-----NEW BLOCK-----\n");
if (zc->externSeqStore.pos < zc->externSeqStore.size) {
assert(!zc->appliedParams.ldmParams.enableLdm);
/* Updates ldmSeqStore.pos */
Expand All @@ -2338,7 +2339,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
src, srcSize);
assert(zc->externSeqStore.pos <= zc->externSeqStore.size);
} else if (zc->appliedParams.ldmParams.enableLdm) {
rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0};
rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0, 0};

ldmSeqStore.seq = zc->ldmSequences;
ldmSeqStore.capacity = zc->maxNbLdmSequences;
Expand All @@ -2360,6 +2361,7 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize)
{ const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize;
ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize);
} }
printf("Finished BuildSeqStore()\n");
return ZSTDbss_compress;
}

Expand Down
34 changes: 21 additions & 13 deletions lib/compress/zstd_compress_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,26 @@ typedef struct {
U32 lowLimit; /* below that point, no more valid data */
} ZSTD_window_t;

/* One raw sequence: a run of literals followed by a match.
 * When the owning rawSeqStore_t has been converted to ranges (rangeFlag != 0),
 * litLength and matchLength are reused to hold positions instead of lengths. */
typedef struct {
U32 offset; /* Offset of the match */
U32 litLength; /* Number of literals before the match; in range form: end of the match */
U32 matchLength; /* Length of the match; in range form: start of the match */
} rawSeq;

/* An array of raw sequences together with a read cursor. */
typedef struct {
rawSeq* seq; /* The start of the sequences */
size_t pos; /* The position where reading stopped. <= size. */
size_t size; /* The number of sequences. <= capacity. */
size_t capacity; /* The capacity starting from `seq` pointer */

U32 rangeFlag; /* Representation of the entries in `seq`:
* 0: plain (offset, litLength, matchLength) sequences, not converted yet.
* 1: converted to match ranges. Members then represent different things:
* seq.matchLength == start of a match
* seq.litLength == end of a match
* capacity == reference start index of this ldm seq store
* 2: same range form as 1, but the store spans multiple blocks and its
* ranges have been shifted by the store's start position.
*/

} rawSeqStore_t;

typedef struct ZSTD_matchState_t ZSTD_matchState_t;
struct ZSTD_matchState_t {
ZSTD_window_t window; /* State for window round buffer management */
Expand All @@ -150,6 +170,7 @@ struct ZSTD_matchState_t {
* dedicated dictionary search structure.
*/
optState_t opt; /* optimal parser state */
rawSeqStore_t ldmSeqStore; /* raw seq store containing LDMs */
const ZSTD_matchState_t* dictMatchState;
ZSTD_compressionParameters cParams;
};
Expand Down Expand Up @@ -183,19 +204,6 @@ typedef struct {
U32 windowLog; /* Window log for the LDM */
} ldmParams_t;

/* One raw sequence: a run of literals followed by a match. */
typedef struct {
U32 offset; /* Offset of the match */
U32 litLength; /* Number of literals before the match */
U32 matchLength; /* Length of the match */
} rawSeq;

/* An array of raw sequences together with a read cursor.
 * Invariant stated by the field comments: pos <= size <= capacity. */
typedef struct {
rawSeq* seq; /* The start of the sequences */
size_t pos; /* The position where reading stopped. <= size. */
size_t size; /* The number of sequences. <= capacity. */
size_t capacity; /* The capacity starting from `seq` pointer */
} rawSeqStore_t;

typedef struct {
int collectSequences;
ZSTD_Sequence* seqStart;
Expand Down
83 changes: 83 additions & 0 deletions lib/compress/zstd_ldm.c
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,64 @@ static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
return sequence;
}

/**
* Converts the elements of a rawSeqStore into a series of ranges representing
* the beginning and end of a match. We store the start of a match in "matchLength"
* and the end of a match in "litLength". So a rawSeqStore containing:
* (litLength: 1000, matchLength: 500)
* (litLength: 2000, matchLength: 1000)
* (litLength: 4000, matchLength: 1000)
*
* would be converted into:
*
* (matchStart: 1000, matchEnd: 1500)
* (matchStart: 3500, matchEnd: 4500)
* (matchStart: 8500, matchEnd: 9500)
*/

static void printSeqStore(rawSeqStore_t* rawSeqStore) {
printf("rawSeqStore: pos: %zu\n", rawSeqStore->pos);
for (int i = 0; i < rawSeqStore->size; ++i) {
printf("(of:%u ml:%u ll: %u)\n", rawSeqStore->seq[i].offset, rawSeqStore->seq[i].matchLength, rawSeqStore->seq[i].litLength);
}
}

/**
 * Shifts every entry of a rawSeqStore by `baseDiff`: the same amount is added
 * to both the matchLength and the litLength field of each sequence.
 *
 * @param rawSeqStore  store whose `size` first entries are updated in place.
 * @param baseDiff     signed shift to apply; may be negative.
 */
static void adjustLdmSeqStore(rawSeqStore_t* rawSeqStore, int baseDiff) {
    /* Converting a negative baseDiff to an unsigned type and adding it relies
     * on unsigned wraparound: once the sum is stored back into the narrower
     * field, the net effect is a subtraction of |baseDiff|. */
    size_t const shift = (size_t)baseDiff;
    size_t i;

    for (i = 0; i < rawSeqStore->size; ++i) {
        rawSeqStore->seq[i].matchLength += shift;
        rawSeqStore->seq[i].litLength += shift;
    }
}

/**
 * Converts a rawSeqStore, in place, from (litLength, matchLength) pairs into
 * cumulative match ranges. Positions are counted from 0 at the first literal
 * of the store. After conversion, for each sequence:
 *   seq[i].matchLength == start position of match i
 *   seq[i].litLength   == end position of match i
 *
 * rangeFlag is set to 1. If the last range ends beyond `srcSize`, the store is
 * treated as spanning several blocks: rangeFlag is set to 2 and every range is
 * shifted by `seqStoreStartPos` through adjustLdmSeqStore().
 *
 * An empty store (size == 0) is left untouched, including its rangeFlag.
 *
 * @param rawSeqStore       store to convert in place.
 * @param srcSize           size of the current block, used for the multi-block test.
 * @param seqStoreStartPos  shift applied to all ranges when the store spans blocks.
 */
static void convertSeqStoreToRanges(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 seqStoreStartPos) {
    size_t i;
    size_t currPos = 0;

    if (rawSeqStore->size == 0)
        return;

    printf("Conversion...\n");
    rawSeqStore->rangeFlag = 1;
    for (i = 0; i < rawSeqStore->size; ++i) {
        size_t matchStart;
        size_t matchEnd;
        currPos += rawSeqStore->seq[i].litLength;
        matchStart = currPos;
        currPos += rawSeqStore->seq[i].matchLength;
        matchEnd = currPos;
        /* The fields are 32-bit: make the narrowing explicit. */
        rawSeqStore->seq[i].matchLength = (U32)matchStart;
        rawSeqStore->seq[i].litLength = (U32)matchEnd;
        /* %zu, not %u: both values are size_t. */
        printf("(%zu, %zu)\n", matchStart, matchEnd);
    }
    printf("size:%zu\n", rawSeqStore->size);

    /* Heuristic multi-block detection: the last match ends past this block. */
    if (rawSeqStore->seq[rawSeqStore->size - 1].litLength > srcSize) {
        printf("SETTING RANGEFLAG TO 2\n");
        rawSeqStore->rangeFlag = 2;   /* Signifies that this is a seqstore that spans
                                         multiple blocks. */
        adjustLdmSeqStore(rawSeqStore, (int)seqStoreStartPos);
    }
}

size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
void const* src, size_t srcSize)
Expand All @@ -576,6 +634,31 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
/* Input positions */
BYTE const* ip = istart;

/* If compression strategy uses optimal parser, use LDMs only as candidates
* rather than accepting all LDMs and calling regular match finder on literal
* blocks in between.
*/
if (cParams->strategy >= ZSTD_btopt) {
printf("ldmSeqStore start idx: %u\n", (U32)(istart - ms->window.base));
size_t cLen;
if ((*rawSeqStore).rangeFlag == 0) {
/* only convert the rawSeqStore once, in case it spans multiple blocks */
printSeqStore(rawSeqStore);
convertSeqStoreToRanges(rawSeqStore, srcSize, (U32)(istart - ms->window.base)); /* sets rangeFlag to true */
}
(*rawSeqStore).capacity = (U32)(istart - ms->window.base);
const BYTE* const prevBase = (BYTE const*)ms->window.base;
ms->ldmSeqStore = *rawSeqStore;
cLen = blockCompressor(ms, seqStore, rep, src, srcSize);
if (prevBase != ms->window.base) {
int baseDiff = (int)(prevBase - ms->window.base);
printf("Bases were different, adjusting, diff = %d\n", baseDiff);
adjustLdmSeqStore(rawSeqStore, baseDiff);
printSeqStore(rawSeqStore);
}
return cLen;
}

DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize);
assert(rawSeqStore->pos <= rawSeqStore->size);
assert(rawSeqStore->size <= rawSeqStore->capacity);
Expand Down
160 changes: 158 additions & 2 deletions lib/compress/zstd_opt.c
Original file line number Diff line number Diff line change
Expand Up @@ -763,8 +763,134 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6);
}
}
/*********************************
* LDM functions
*********************************/

/* TODO: Increment by SBI */
/**
 * Advances the current LDM range (*ldmStart, *ldmEnd) to the next entry of
 * `ldmSeqStore` once `current` has moved past the current one.
 *
 * Nothing happens when the store is empty, has not been converted to ranges
 * (rangeFlag == 0), or when `pos` already designates the last entry.
 *
 * rangeFlag == 2: *ldmStart / *ldmEnd are compared with `current` as-is. After
 * the optional fetch, a range that straddles the end (resp. start) of the block
 * is clamped to endBlockIdx (resp. startBlockIdx); `pos` is not advanced by the
 * clamping, so the same entry stays current.
 *
 * Otherwise: the stored end is offset by startBlockIdx before being compared
 * with `current`.
 *
 * @param ldmStart             in/out: start of the current LDM range.
 * @param ldmEnd               in/out: end of the current LDM range.
 * @param ldmSeqStore          store to read from; its `pos` is incremented on fetch.
 * @param ldmSeqStoreStartPos  unused.
 * @param currPosInBlock       unused.
 * @param current              index of the position being parsed.
 * @param startBlockIdx        index of the first position of the block.
 * @param endBlockIdx          index one past the last position of the block.
 */
static void getNextLdm(U32* ldmStart, U32* ldmEnd, rawSeqStore_t* ldmSeqStore,
                       const U32 ldmSeqStoreStartPos, U32 currPosInBlock, U32 current, U32 startBlockIdx, U32 endBlockIdx) {
    /* Not used yet; kept in the signature so existing callers are unaffected. */
    (void)ldmSeqStoreStartPos;
    (void)currPosInBlock;

    /* Test for the empty store first: `size - 1` below wraps around when size == 0. */
    if (ldmSeqStore->size == 0 || ldmSeqStore->rangeFlag == 0)
        return;
    /* pos == size-1 means the current LDM is the last one: never fetch another. */
    if (ldmSeqStore->pos >= ldmSeqStore->size - 1)
        return;

    if (ldmSeqStore->rangeFlag == 2) {
        /* No adjustment needed: ranges of a multi-block store were already shifted. */
        /* NOTE(review): this branch fetches on `current > end` while the branch
         * below fetches on `current >= end` — confirm which one is intended. */
        if (current > *ldmEnd) {
            ldmSeqStore->pos++;
            *ldmStart = ldmSeqStore->seq[ldmSeqStore->pos].matchLength;
            *ldmEnd = ldmSeqStore->seq[ldmSeqStore->pos].litLength;
        }
        /* Match splitting: an LDM that starts before the block ends and ends
         * after it is truncated at the block end. pos is not incremented, so we
         * stay on this match until it ends. */
        if (*ldmStart < endBlockIdx && *ldmEnd > endBlockIdx) {
            *ldmEnd = endBlockIdx;
        }
        /* Symmetric case: an LDM that began before this block starts at its start. */
        if (*ldmStart < startBlockIdx && *ldmEnd > startBlockIdx) {
            *ldmStart = startBlockIdx;
        }
    } else {
        /* In this case, all of the LDMs are within this one block. */
        U32 const ldmEndAdjusted = *ldmEnd + startBlockIdx;
        if (current >= ldmEndAdjusted) {
            ldmSeqStore->pos++;
            *ldmStart = ldmSeqStore->seq[ldmSeqStore->pos].matchLength;
            *ldmEnd = ldmSeqStore->seq[ldmSeqStore->pos].litLength;
        }
    }
}

/**
 * Offers the current LDM range as an additional candidate in `matches`.
 *
 * The candidate is appended only when all of the following hold:
 *  - the store is not empty;
 *  - `current` lies inside [start, end) of the LDM (the range is offset by
 *    startBlockIdx first when rangeFlag == 1);
 *  - `current` is exactly the start of the LDM (posDifference == 0, see TODO);
 *  - the candidate length is at least ZSTD_LDM_MINMATCH_MIN;
 *  - fewer than ZSTD_OPT_NUM matches are stored, and the candidate is at least
 *    as long as the last stored match.
 *
 * When the candidate has the same length as the last stored match, it is
 * sifted downwards so that, among equal-length entries, the larger offset code
 * comes first.
 *
 * @param ldmSeqStore    converted LDM store; seq[pos].offset supplies the offset.
 * @param matches        match table to append to.
 * @param nbMatches      in/out: number of entries in `matches`.
 * @param ldmStart       start of the current LDM range.
 * @param ldmEnd         end of the current LDM range.
 * @param current        index of the position being parsed.
 * @param startBlockIdx  index of the first position of the block.
 */
static void maybeAddLdm(const rawSeqStore_t* const ldmSeqStore, ZSTD_match_t* matches,
                        U32* nbMatches, U32 ldmStart, U32 ldmEnd, U32 current, U32 startBlockIdx) {
    U32 ldmStartAdjusted;
    U32 ldmEndAdjusted;
    U32 originalMatchLength;
    U32 posDifference;
    U32 candidateOffCode;
    U32 candidateMatchLength;
    U32 candidateMatchIdx;

    if (ldmSeqStore->size == 0)
        return;
    assert(ldmSeqStore->rangeFlag != 0);

    /* Adjusted ldms for when the ldm seq store was calculated for this block only */
    ldmStartAdjusted = ldmSeqStore->rangeFlag == 1 ? ldmStart + startBlockIdx : ldmStart;
    ldmEndAdjusted = ldmSeqStore->rangeFlag == 1 ? ldmEnd + startBlockIdx : ldmEnd;

    /* Current must be within the adjusted ldm (same test for every range form) */
    if (current < ldmStartAdjusted || current >= ldmEndAdjusted)
        return;

    originalMatchLength = ldmEndAdjusted - ldmStartAdjusted;
    posDifference = current - ldmStartAdjusted;
    /* TODO: for now the LDM is only proposed at its exact start position. */
    if (posDifference > 0 || posDifference >= originalMatchLength) {
        return;
    }
    printf("Original matchlen: %u - ", originalMatchLength);
    printf("Considering LDM range (%u, %u) -> abs: (%u, %u) @ current = %u\n", ldmStart, ldmEnd, ldmStartAdjusted, ldmEndAdjusted, current);

    candidateOffCode = ldmSeqStore->seq[ldmSeqStore->pos].offset + posDifference + ZSTD_REP_MOVE;
    candidateMatchLength = originalMatchLength - posDifference;
    if (candidateMatchLength < ZSTD_LDM_MINMATCH_MIN)
        return;

    /* Table full, or candidate shorter than the current best: nothing to add. */
    if (*nbMatches >= ZSTD_OPT_NUM)
        return;
    if (*nbMatches != 0 && candidateMatchLength < matches[*nbMatches-1].len)
        return;

    printf("large enough, adding\n");
    candidateMatchIdx = *nbMatches;
    matches[candidateMatchIdx].len = candidateMatchLength;
    matches[candidateMatchIdx].off = candidateOffCode;

    if (candidateMatchIdx > 0) {
        if (candidateMatchLength == matches[candidateMatchIdx - 1].len) {
            /* Compare with the previous best entry, not with the slot that was
             * just written (that comparison could never be true). */
            if (candidateOffCode != matches[candidateMatchIdx - 1].off)
                printf("DIFF: ldm: (len: %u, off: %u), best: (len: %u, off: %u)\n", candidateMatchLength, candidateOffCode, matches[candidateMatchIdx - 1].len, matches[candidateMatchIdx - 1].off);
            /* Sift the candidate down past equal-length entries with a smaller offset code. */
            while (candidateMatchIdx > 0 &&
                   matches[candidateMatchIdx].off > matches[candidateMatchIdx - 1].off &&
                   matches[candidateMatchIdx].len == matches[candidateMatchIdx - 1].len) {
                ZSTD_match_t const tmp = matches[candidateMatchIdx - 1];
                matches[candidateMatchIdx - 1] = matches[candidateMatchIdx];
                matches[candidateMatchIdx] = tmp;
                --candidateMatchIdx;
            }
        } else {
            printf("MATCHDIFF: ldm: (len: %u, off: %u), best: (len: %u, off: %u)\n", candidateMatchLength, candidateOffCode, matches[candidateMatchIdx - 1].len, matches[candidateMatchIdx - 1].off);
        }
    }
    (*nbMatches)++;
}

/*-*******************************
* Optimal parser
*********************************/
Expand Down Expand Up @@ -813,6 +939,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1);
U32 const minMatch = (cParams->minMatch == 3) ? 3 : 4;
U32 nextToUpdate3 = ms->nextToUpdate;
U32 const startBlockIdx = (U32)(istart - base);
U32 const endBlockIdx = startBlockIdx + srcSize;

U32 currLdmStart;
U32 currLdmEnd;

ZSTD_optimal_t* const opt = optStatePtr->priceTable;
ZSTD_match_t* const matches = optStatePtr->matchTable;
Expand All @@ -821,9 +952,21 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
/* init */
DEBUGLOG(5, "ZSTD_compressBlock_opt_generic: current=%u, prefix=%u, nextToUpdate=%u",
(U32)(ip - base), ms->window.dictLimit, ms->nextToUpdate);
//printf("ZSTD_compressBlock_opt_generic: current=%u, sbi=%u, ebi=%u\n", (U32)(ip - base), startBlockIdx, endBlockIdx);
assert(optLevel <= 2);
ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel);
ip += (ip==prefixStart);
int ldmAdjusted = 0;

/* Set current LDM candidate to whatever might have been considered in prev block */
if (ms->ldmSeqStore.size != 0) {
size_t readIdx = ms->ldmSeqStore.pos == 0 ? 0 : ms->ldmSeqStore.pos - 1;
currLdmStart = ms->ldmSeqStore.seq[readIdx].matchLength;
currLdmEnd = ms->ldmSeqStore.seq[readIdx].litLength;
//printf("Starting opt with ldm : (%u, %u)\n", currLdmStart, currLdmEnd);
} else {
currLdmStart = currLdmEnd = 0;
}

/* Match Loop */
while (ip < ilimit) {
Expand All @@ -832,7 +975,12 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
/* find first match */
{ U32 const litlen = (U32)(ip - anchor);
U32 const ll0 = !litlen;
U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch);
U32 const current = (U32)(ip - base);

getNextLdm(&currLdmStart, &currLdmEnd, &ms->ldmSeqStore, ms->ldmSeqStore.capacity, (U32)(ip-istart), current, startBlockIdx, startBlockIdx + srcSize);
U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch);
maybeAddLdm(&ms->ldmSeqStore, matches, &nbMatches, currLdmStart, currLdmEnd, current, startBlockIdx);

if (!nbMatches) { ip++; continue; }

/* initialize opt[0] */
Expand Down Expand Up @@ -890,6 +1038,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
/* check further positions */
for (cur = 1; cur <= last_pos; cur++) {
const BYTE* const inr = ip + cur;
U32 const current = (U32)(inr - base);
assert(cur < ZSTD_OPT_NUM);
DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur)

Expand Down Expand Up @@ -945,7 +1094,13 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0;
U32 const previousPrice = opt[cur].price;
U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel);
U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch);

/* Fetch next LDM if necessary */

getNextLdm(&currLdmStart, &currLdmEnd, &ms->ldmSeqStore, ms->ldmSeqStore.capacity, (U32)(inr-istart), current, startBlockIdx, startBlockIdx + srcSize);
U32 nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch);
maybeAddLdm(&ms->ldmSeqStore, matches, &nbMatches, currLdmStart, currLdmEnd, current, startBlockIdx);

U32 matchNb;
if (!nbMatches) {
DEBUGLOG(7, "rPos:%u : no match found", cur);
Expand Down Expand Up @@ -1061,6 +1216,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms,
}
} /* while (ip < ilimit) */

//printf("Finished opt\n");
/* Return the last literals size */
return (size_t)(iend - anchor);
}
Expand Down
Loading