zdict.h 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474
  1. /*
  2. * Copyright (c) Meta Platforms, Inc. and affiliates.
  3. * All rights reserved.
  4. *
  5. * This source code is licensed under both the BSD-style license (found in the
  6. * LICENSE file in the root directory of this source tree) and the GPLv2 (found
  7. * in the COPYING file in the root directory of this source tree).
  8. * You may select, at your option, one of the above-listed licenses.
  9. */
  10. #if defined (__cplusplus)
  11. extern "C" {
  12. #endif
  13. #ifndef ZSTD_ZDICT_H
  14. #define ZSTD_ZDICT_H
  15. /*====== Dependencies ======*/
  16. #include <stddef.h> /* size_t */
  17. /* ===== ZDICTLIB_API : controls the visibility of the library's public symbols ===== */
  18. #ifndef ZDICTLIB_VISIBLE /* may be pre-defined externally to override the defaults below */
  19. /* Backwards compatibility with the old macro name, ZDICTLIB_VISIBILITY */
  20. # ifdef ZDICTLIB_VISIBILITY
  21. # define ZDICTLIB_VISIBLE ZDICTLIB_VISIBILITY
  22. # elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
  23. # define ZDICTLIB_VISIBLE __attribute__ ((visibility ("default")))
  24. # else /* neither an override nor the GCC-style visibility attribute applies: expand to nothing */
  25. # define ZDICTLIB_VISIBLE
  26. # endif
  27. #endif
  28. #ifndef ZDICTLIB_HIDDEN /* counterpart of ZDICTLIB_VISIBLE: requests hidden visibility under the same compiler conditions */
  29. # if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
  30. # define ZDICTLIB_HIDDEN __attribute__ ((visibility ("hidden")))
  31. # else
  32. # define ZDICTLIB_HIDDEN
  33. # endif
  34. #endif
  35. #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) /* symbols are exported from a DLL */
  36. # define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBLE
  37. #elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) /* symbols are imported from a DLL */
  38. # define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBLE /* Not required, but it lets the compiler generate better code, saving a function pointer load from the IAT and an indirect jump. */
  39. #else
  40. # define ZDICTLIB_API ZDICTLIB_VISIBLE
  41. #endif
  42. /*******************************************************************************
  43. * Zstd dictionary builder
  44. *
  45. * FAQ
  46. * ===
  47. * Why should I use a dictionary?
  48. * ------------------------------
  49. *
  50. * Zstd can use dictionaries to improve compression ratio of small data.
  51. * Traditionally small files don't compress well because there is very little
  52. * repetition in a single sample, since it is small. But, if you are compressing
  53. * many similar files, like a bunch of JSON records that share the same
  54. * structure, you can train a dictionary ahead of time on some samples of
  55. * these files. Then, zstd can use the dictionary to find repetitions that are
  56. * present across samples. This can vastly improve compression ratio.
  57. *
  58. * When is a dictionary useful?
  59. * ----------------------------
  60. *
  61. * Dictionaries are useful when compressing many small files that are similar.
  62. * The larger a file is, the less benefit a dictionary will have. Generally,
  63. * we don't expect dictionary compression to be effective past 100KB. And the
  64. * smaller a file is, the more we would expect the dictionary to help.
  65. *
  66. * How do I use a dictionary?
  67. * --------------------------
  68. *
  69. * Simply pass the dictionary to the zstd compressor with
  70. * `ZSTD_CCtx_loadDictionary()`. The same dictionary must then be passed to
  71. * the decompressor, using `ZSTD_DCtx_loadDictionary()`. There are other
  72. * more advanced functions that allow selecting some options, see zstd.h for
  73. * complete documentation.
  74. *
  75. * What is a zstd dictionary?
  76. * --------------------------
  77. *
  78. * A zstd dictionary has two pieces: Its header, and its content. The header
  79. * contains a magic number, the dictionary ID, and entropy tables. These
  80. * entropy tables allow zstd to save on header costs in the compressed file,
  81. * which really matters for small data. The content is just bytes, which are
  82. * repeated content that is common across many samples.
  83. *
  84. * What is a raw content dictionary?
  85. * ---------------------------------
  86. *
  87. * A raw content dictionary is just bytes. It doesn't have a zstd dictionary
  88. * header, a dictionary ID, or entropy tables. Any buffer is a valid raw
  89. * content dictionary.
  90. *
  91. * How do I train a dictionary?
  92. * ----------------------------
  93. *
  94. * Gather samples from your use case. These samples should be similar to each
  95. * other. If you have several use cases, you could try to train one dictionary
  96. * per use case.
  97. *
  98. * Pass those samples to `ZDICT_trainFromBuffer()` and that will train your
  99. * dictionary. There are a few advanced versions of this function, but this
  100. * is a great starting point. If you want to further tune your dictionary
  101. * you could try `ZDICT_optimizeTrainFromBuffer_cover()`. If that is too slow
  102. * you can try `ZDICT_optimizeTrainFromBuffer_fastCover()`.
  103. *
  104. * If the dictionary training function fails, that is likely because you
  105. * either passed too few samples, or a dictionary would not be effective
  106. * for your data. Look at the messages that the dictionary trainer printed:
  107. * if they don't mention too few samples, then a dictionary would not be effective.
  108. *
  109. * How large should my dictionary be?
  110. * ----------------------------------
  111. *
  112. * A reasonable dictionary size, the `dictBufferCapacity`, is about 100KB.
  113. * The zstd CLI defaults to a 110KB dictionary. You likely don't need a
  114. * dictionary larger than that. But, most use cases can get away with a
  115. * smaller dictionary. The advanced dictionary builders can automatically
  116. * shrink the dictionary for you, and select the smallest size that doesn't
  117. * hurt compression ratio too much. See the `shrinkDict` parameter.
  118. * A smaller dictionary can save memory, and potentially speed up
  119. * compression.
  120. *
  121. * How many samples should I provide to the dictionary builder?
  122. * ------------------------------------------------------------
  123. *
  124. * We generally recommend passing ~100x the size of the dictionary
  125. * in samples. A few thousand should suffice. Having too few samples
  126. * can hurt the dictionary's effectiveness. Having more samples will
  127. * only improve the dictionary's effectiveness. But having too many
  128. * samples can slow down the dictionary builder.
  129. *
  130. * How do I determine if a dictionary will be effective?
  131. * -----------------------------------------------------
  132. *
  133. * Simply train a dictionary and try it out. You can use zstd's built-in
  134. * benchmarking tool to test the dictionary effectiveness.
  135. *
  136. * # Benchmark levels 1-3 without a dictionary
  137. * zstd -b1e3 -r /path/to/my/files
  138. * # Benchmark levels 1-3 with a dictionary
  139. * zstd -b1e3 -r /path/to/my/files -D /path/to/my/dictionary
  140. *
  141. * When should I retrain a dictionary?
  142. * -----------------------------------
  143. *
  144. * You should retrain a dictionary when its effectiveness drops. Dictionary
  145. * effectiveness drops as the data you are compressing changes. Generally, we do
  146. * expect dictionaries to "decay" over time, as your data changes, but the rate
  147. * at which they decay depends on your use case. Internally, we regularly
  148. * retrain dictionaries, and if the new dictionary performs significantly
  149. * better than the old dictionary, we will ship the new dictionary.
  150. *
  151. * I have a raw content dictionary, how do I turn it into a zstd dictionary?
  152. * -------------------------------------------------------------------------
  153. *
  154. * If you have a raw content dictionary, e.g. by manually constructing it, or
  155. * using a third-party dictionary builder, you can turn it into a zstd
  156. * dictionary by using `ZDICT_finalizeDictionary()`. You'll also have to
  157. * provide some samples of the data. It will add the zstd header to the
  158. * raw content, which contains a dictionary ID and entropy tables, which
  159. * will improve compression ratio, and allow zstd to write the dictionary ID
  160. * into the frame, if you so choose.
  161. *
  162. * Do I have to use zstd's dictionary builder?
  163. * -------------------------------------------
  164. *
  165. * No! You can construct dictionary content however you please, it is just
  166. * bytes. It will always be valid as a raw content dictionary. If you want
  167. * a zstd dictionary, which can improve compression ratio, use
  168. * `ZDICT_finalizeDictionary()`.
  169. *
  170. * What is the attack surface of a zstd dictionary?
  171. * ------------------------------------------------
  172. *
  173. * Zstd is heavily fuzz tested, including loading fuzzed dictionaries, so
  174. * zstd should never crash, or access out-of-bounds memory no matter what
  175. * the dictionary is. However, if an attacker can control the dictionary
  176. * during decompression, they can cause zstd to generate arbitrary bytes,
  177. * just like if they controlled the compressed data.
  178. *
  179. ******************************************************************************/
  180. /*! ZDICT_trainFromBuffer():
  181. * Train a dictionary from an array of samples.
  182. * Redirects to ZDICT_optimizeTrainFromBuffer_fastCover(), single-threaded, with d=8, steps=4,
  183. * f=20, and accel=1.
  184. * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
  185. * supplied with an array of sizes `samplesSizes`, providing the size of each of the `nbSamples` samples, in order.
  186. * The resulting dictionary will be saved into `dictBuffer`.
  187. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  188. * or an error code, which can be tested with ZDICT_isError().
  189. * Note: Dictionary training will fail if there are not enough samples to construct a
  190. * dictionary, or if most of the samples are too small (< 8 bytes being the lower limit).
  191. * If dictionary training fails, you should use zstd without a dictionary, as the dictionary
  192. * would have been ineffective anyway. If you believe your samples would benefit from a dictionary,
  193. * please open an issue with details, and we can look into it.
  194. * Note: ZDICT_trainFromBuffer()'s memory usage is about 6 MB.
  195. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
  196. * It's possible to select a smaller or larger size, just by specifying `dictBufferCapacity`.
  197. * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
  198. * It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
  199. */
  200. ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
  201. const void* samplesBuffer,
  202. const size_t* samplesSizes, unsigned nbSamples);
  203. typedef struct { /* Parameters shared by the dictionary builders: passed directly to ZDICT_finalizeDictionary() and embedded as `zParams` in the cover, fastCover and legacy parameter structures. */
  204. int compressionLevel; /**< optimize for a specific zstd compression level; 0 means default */
  205. unsigned notificationLevel; /**< Write log to stderr; 0 = none (default); 1 = errors; 2 = progress; 3 = details; 4 = debug; */
  206. unsigned dictID; /**< force dictID value; 0 means auto mode (32-bit random value)
  207. * NOTE: The zstd format reserves some dictionary IDs for future use.
  208. * You may use them in private settings, but be warned that they
  209. * may be used by zstd in a public dictionary registry in the future.
  210. * These dictionary IDs are:
  211. * - low range : <= 32767
  212. * - high range : >= (2^31)
  213. */
  214. } ZDICT_params_t;
  215. /*! ZDICT_finalizeDictionary():
  216. * Given custom content as a basis for a dictionary, and a set of samples,
  217. * finalize the dictionary by adding headers and statistics according to the zstd
  218. * dictionary format.
  219. *
  220. * Samples must be stored concatenated in a flat buffer `samplesBuffer`,
  221. * supplied with an array of sizes `samplesSizes`, providing the size of each
  222. * sample in order. The samples are used to construct the statistics, so they
  223. * should be representative of what you will compress with this dictionary.
  224. *
  225. * The compression level can be set in `parameters`. You should pass the
  226. * compression level you expect to use in production. The statistics for each
  227. * compression level differ, so tuning the dictionary for the compression level
  228. * can help quite a bit.
  229. *
  230. * You can set an explicit dictionary ID in `parameters`, or allow us to pick
  231. * a random dictionary ID for you, but we can't guarantee no collisions.
  232. *
  233. * The dstDictBuffer and the dictContent may overlap, and the content will be
  234. * appended to the end of the header. If the header + the content doesn't fit in
  235. * maxDictSize, the beginning of the content is truncated to make room, since it
  236. * is presumed that the most profitable content is at the end of the dictionary,
  237. * since that is the cheapest to reference.
  238. *
  239. * `maxDictSize` must be >= max(dictContentSize, ZDICT_DICTSIZE_MIN) (ZDICT_DICTSIZE_MIN is declared in the static-linking-only section of this header).
  240. *
  241. * @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`),
  242. * or an error code, which can be tested by ZDICT_isError().
  243. * Note: ZDICT_finalizeDictionary() will push notifications into stderr if
  244. * instructed to, using notificationLevel>0.
  245. * NOTE: This function currently may fail in several edge cases including:
  246. * * Not enough samples
  247. * * Samples are incompressible
  248. * * Samples are all exactly the same
  249. */
  250. ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
  251. const void* dictContent, size_t dictContentSize,
  252. const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
  253. ZDICT_params_t parameters);
  254. /*====== Helper functions ======*/
  255. ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */
  256. ZDICTLIB_API size_t ZDICT_getDictHeaderSize(const void* dictBuffer, size_t dictSize); /**< returns dict header size; returns a ZSTD error code on failure */
  257. ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode); /**< tests whether a size_t result returned by one of the ZDICT_*() functions is an error code; presumably non-zero means error (TODO confirm against zdict.c) */
  258. ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); /**< presumably returns a readable name for `errorCode` (inferred from the function name; TODO confirm against zdict.c) */
  259. #endif /* ZSTD_ZDICT_H */
  260. #if defined(ZDICT_STATIC_LINKING_ONLY) && !defined(ZSTD_ZDICT_H_STATIC)
  261. #define ZSTD_ZDICT_H_STATIC
  262. /* ZDICTLIB_STATIC_API qualifies the experimental declarations of this section. It can be pre-defined externally (e.g. to hide these symbols); otherwise it is resolved with the same DLL export/import rules as ZDICTLIB_API. */
  263. #ifndef ZDICTLIB_STATIC_API
  264. # if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
  265. # define ZDICTLIB_STATIC_API __declspec(dllexport) ZDICTLIB_VISIBLE
  266. # elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
  267. # define ZDICTLIB_STATIC_API __declspec(dllimport) ZDICTLIB_VISIBLE
  268. # else
  269. # define ZDICTLIB_STATIC_API ZDICTLIB_VISIBLE
  270. # endif
  271. #endif
  272. /* ====================================================================================
  273. * The definitions in this section are considered experimental.
  274. * They should never be used with a dynamic library, as they may change in the future.
  275. * They are provided for advanced usages.
  276. * Use them only in association with static linking.
  277. * ==================================================================================== */
  278. #define ZDICT_DICTSIZE_MIN 256 /* presumably the smallest accepted dictionary size, in bytes (inferred from the name; TODO confirm against zdict.c) */
  279. /* Deprecated: ZDICT_CONTENTSIZE_MIN is scheduled for removal in v1.6.0 */
  280. #define ZDICT_CONTENTSIZE_MIN 128
  281. /*! ZDICT_cover_params_t:
  282. * k and d are the only required parameters (ZDICT_optimizeTrainFromBuffer_cover() can also search for them when they are left at 0).
  283. * For others, value 0 means default.
  284. */
  285. typedef struct {
  286. unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
  287. unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
  288. unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
  289. unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
  290. double splitPoint; /* Fraction of samples used for training (e.g. 0.75 = 75%) : Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (1.0), 1.0 means all samples are used for both training and testing */
  291. unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size, and select the smallest dictionary that is at most shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
  292. unsigned shrinkDictMaxRegression; /* Maximum regression allowed when shrinking: a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
  293. ZDICT_params_t zParams;
  294. } ZDICT_cover_params_t;
  295. typedef struct { /* Parameters for the fastCover builders: k and d are required by ZDICT_trainFromBuffer_fastCover(); for the other fields, 0 means default. */
  296. unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */
  297. unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */
  298. unsigned f; /* log of size of frequency array : constraint: 0 < f <= 31 : 0 means default(20) */
  299. unsigned steps; /* Number of steps : Only used for optimization : 0 means default (40) : Higher means more parameters checked */
  300. unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined */
  301. double splitPoint; /* Fraction of samples used for training (e.g. 0.75 = 75%) : Only used for optimization : the first nbSamples * splitPoint samples will be used for training, the last nbSamples * (1 - splitPoint) samples will be used for testing, 0 means default (0.75), 1.0 means all samples are used for both training and testing */
  302. unsigned accel; /* Acceleration level: constraint: 0 < accel <= 10, higher means faster and less accurate, 0 means default(1) */
  303. unsigned shrinkDict; /* Train dictionaries to shrink in size starting from the minimum size, and select the smallest dictionary that is at most shrinkDictMaxRegression% worse than the largest dictionary. 0 means no shrinking and 1 means shrinking */
  304. unsigned shrinkDictMaxRegression; /* Maximum regression allowed when shrinking: a smaller dictionary can be at worst shrinkDictMaxRegression% worse than the max dict size dictionary. */
  305. ZDICT_params_t zParams;
  306. } ZDICT_fastCover_params_t;
  307. /*! ZDICT_trainFromBuffer_cover():
  308. * Train a dictionary from an array of samples using the COVER algorithm. `parameters.k` and `parameters.d` must be set; the other fields use their default when 0.
  309. * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
  310. * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
  311. * The resulting dictionary will be saved into `dictBuffer`.
  312. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  313. * or an error code, which can be tested with ZDICT_isError().
  314. * See ZDICT_trainFromBuffer() for details on failure modes.
  315. * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte.
  316. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
  317. * It's possible to select a smaller or larger size, just by specifying `dictBufferCapacity`.
  318. * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
  319. * It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
  320. */
  321. ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_cover(
  322. void *dictBuffer, size_t dictBufferCapacity,
  323. const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples,
  324. ZDICT_cover_params_t parameters);
  325. /*! ZDICT_optimizeTrainFromBuffer_cover():
  326. * The same requirements as for ZDICT_trainFromBuffer_cover() hold for all the parameters except `parameters`.
  327. * This function tries many parameter combinations and picks the best parameters.
  328. * `*parameters` is filled with the best parameters found,
  329. * and the dictionary constructed with those parameters is stored in `dictBuffer`.
  330. *
  331. * All of the parameters d, k, steps are optional.
  332. * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
  333. * If steps is zero it defaults to its default value.
  334. * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [50, 2000].
  335. *
  336. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  337. * or an error code, which can be tested with ZDICT_isError().
  338. * On success `*parameters` contains the parameters selected.
  339. * See ZDICT_trainFromBuffer() for details on failure modes.
  340. * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte and additionally another 5 bytes of memory for each byte of memory for each thread.
  341. */
  342. ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_cover(
  343. void* dictBuffer, size_t dictBufferCapacity,
  344. const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
  345. ZDICT_cover_params_t* parameters);
  346. /*! ZDICT_trainFromBuffer_fastCover():
  347. * Train a dictionary from an array of samples using a modified version of the COVER algorithm.
  348. * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
  349. * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
  350. * d and k are required.
  351. * All other parameters are optional and will use default values if not provided (left at 0).
  352. * The resulting dictionary will be saved into `dictBuffer`.
  353. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  354. * or an error code, which can be tested with ZDICT_isError().
  355. * See ZDICT_trainFromBuffer() for details on failure modes.
  356. * Note: ZDICT_trainFromBuffer_fastCover() requires 6 * 2^f bytes of memory.
  357. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
  358. * It's possible to select a smaller or larger size, just by specifying `dictBufferCapacity`.
  359. * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
  360. * It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
  361. */
  362. ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_fastCover(void *dictBuffer,
  363. size_t dictBufferCapacity, const void *samplesBuffer,
  364. const size_t *samplesSizes, unsigned nbSamples,
  365. ZDICT_fastCover_params_t parameters);
  366. /*! ZDICT_optimizeTrainFromBuffer_fastCover():
  367. * The same requirements as for ZDICT_trainFromBuffer_fastCover() hold for all the parameters except `parameters`.
  368. * This function tries many parameter combinations (specifically, k and d combinations)
  369. * and picks the best parameters. `*parameters` is filled with the best parameters found,
  370. * and the dictionary constructed with those parameters is stored in `dictBuffer`.
  371. * All of the parameters d, k, steps, f, and accel are optional.
  372. * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8}.
  373. * If steps is zero it defaults to its default value.
  374. * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [50, 2000].
  375. * If f is zero, default value of 20 is used.
  376. * If accel is zero, default value of 1 is used.
  377. *
  378. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  379. * or an error code, which can be tested with ZDICT_isError().
  380. * On success `*parameters` contains the parameters selected.
  381. * See ZDICT_trainFromBuffer() for details on failure modes.
  382. * Note: ZDICT_optimizeTrainFromBuffer_fastCover() requires about 6 * 2^f bytes of memory for each thread.
  383. */
  384. ZDICTLIB_STATIC_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
  385. size_t dictBufferCapacity, const void* samplesBuffer,
  386. const size_t* samplesSizes, unsigned nbSamples,
  387. ZDICT_fastCover_params_t* parameters);
  388. typedef struct { /* Parameters for ZDICT_trainFromBuffer_legacy(). */
  389. unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
  390. ZDICT_params_t zParams; /* shared builder parameters: compression level, notification level, dictID */
  391. } ZDICT_legacy_params_t;
  392. /*! ZDICT_trainFromBuffer_legacy():
  393. * Train a dictionary from an array of samples.
  394. * Samples must be stored concatenated in a single flat buffer `samplesBuffer`,
  395. * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order.
  396. * The resulting dictionary will be saved into `dictBuffer`.
  397. * The fields of `parameters` are optional: any of them can be set to 0 to mean "default".
  398. * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`)
  399. * or an error code, which can be tested with ZDICT_isError().
  400. * See ZDICT_trainFromBuffer() for details on failure modes.
  401. * Tips: In general, a reasonable dictionary has a size of ~ 100 KB.
  402. * It's possible to select a smaller or larger size, just by specifying `dictBufferCapacity`.
  403. * In general, it's recommended to provide a few thousand samples, though this can vary a lot.
  404. * It's recommended that the total size of all samples be about 100 times the target size of the dictionary.
  405. * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0.
  406. */
  407. ZDICTLIB_STATIC_API size_t ZDICT_trainFromBuffer_legacy(
  408. void* dictBuffer, size_t dictBufferCapacity,
  409. const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
  410. ZDICT_legacy_params_t parameters);
  411. /* Deprecation warnings */
  412. /* It is generally possible to disable deprecation warnings from the compiler,
  413. for example with -Wno-deprecated-declarations for gcc
  414. or _CRT_SECURE_NO_WARNINGS in Visual Studio.
  415. Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */
  416. #ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS
  417. # define ZDICT_DEPRECATED(message) /* disable deprecation warnings */
  418. #else
  419. # define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) /* major * 100 + minor, e.g. 405 for GCC 4.5 */
  420. # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater */
  421. # define ZDICT_DEPRECATED(message) [[deprecated(message)]]
  422. # elif defined(__clang__) || (ZDICT_GCC_VERSION >= 405) /* attribute form that carries the message */
  423. # define ZDICT_DEPRECATED(message) __attribute__((deprecated(message)))
  424. # elif (ZDICT_GCC_VERSION >= 301) /* attribute form without a message argument */
  425. # define ZDICT_DEPRECATED(message) __attribute__((deprecated))
  426. # elif defined(_MSC_VER)
  427. # define ZDICT_DEPRECATED(message) __declspec(deprecated(message))
  428. # else /* unknown compiler: emit a build-time notice and expand to nothing */
  429. # pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler")
  430. # define ZDICT_DEPRECATED(message)
  431. # endif
  432. #endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */
  433. ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead")
  434. ZDICTLIB_STATIC_API
  435. size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
  436. const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); /**< Deprecated: new code should call ZDICT_finalizeDictionary() instead. Judging by the name and parameters, this presumably adds entropy tables, computed from the samples, to the dictionary content held in `dictBuffer` (TODO confirm against zdict.c). */
  437. #endif /* ZSTD_ZDICT_H_STATIC */
  438. #if defined (__cplusplus)
  439. }
  440. #endif