diff --git a/Cargo.lock b/Cargo.lock index 4728b39..85f66fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -123,9 +123,9 @@ dependencies = [ [[package]] name = "actix-rt" -version = "2.10.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24eda4e2a6e042aa4e55ac438a2ae052d3b5da0ecf83d7411e1a368946925208" +checksum = "92589714878ca59a7626ea19734f0e07a6a875197eec751bb5d3f99e64998c63" dependencies = [ "futures-core", "tokio", @@ -324,7 +324,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -335,7 +335,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -455,9 +455,9 @@ checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" [[package]] name = "bytemuck" -version = "1.24.0" +version = "1.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" [[package]] name = "byteorder" @@ -467,15 +467,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b35204fbdc0b3f4446b89fc1ac2cf84a8a68971995d0bf2e925ec7cd960f9cb3" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" [[package]] name = "bytestring" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e465647ae23b2823b0753f50decb2d5a86d2bb2cac04788fafd1f80e45378e5f" +checksum = "113b4343b5f6617e7ad401ced8de3cc8b012e73a594347c307b90db3e9271289" dependencies = [ "bytes", ] @@ -523,9 +523,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.54" +version = "1.2.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6354c81bbfd62d9cfa9cb3c773c2b7b2a3a482d569de977fd0e961f6e7c00583" +checksum = "47b26a0954ae34af09b50f0de26458fa95369a0d478d8236d3f93082b219bd29" dependencies = [ "find-msvc-tools", "jobserver", @@ -615,18 +615,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.54" +version = "4.5.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6e6ff9dcd79cff5cd969a17a545d79e84ab086e444102a591e288a8aa3ce394" +checksum = "6899ea499e3fb9305a65d5ebf6e3d2248c5fab291f300ad0a704fbe142eae31a" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.54" +version = "4.5.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa42cf4d2b7a41bc8f663a7cab4031ebafa1bf3875705bfaf8466dc60ab52c00" +checksum = "7b12c8b680195a62a8364d16b8447b01b6c2c8f9aaf68bee653be34d4245e238" dependencies = [ "anstyle", "clap_lex", @@ -932,7 +932,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -1027,7 +1027,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -1062,15 +1062,15 @@ dependencies = [ [[package]] name = "find-msvc-tools" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "flate2" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b375d6465b98090a5f25b1c7703f3859783755aa9a80433b36e0379a3ec2f369" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", @@ -1178,9 +1178,9 @@ dependencies = [ [[package]] name = "gif" -version = "0.13.1" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb2d69b19215e18bb912fa30f7ce15846e301408695e44e0ef719f1da9e19f2" +checksum = "4ae047235e33e2829703574b54fdec96bfbad892062d97fed2f76022287de61b" dependencies = [ "color_quant", "weezl", @@ -1491,9 +1491,9 @@ dependencies = [ [[package]] name = "jpeg-decoder" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f5d4a7da358eff58addd2877a45865158f0d78c911d43a5784ceb7bbf52833b0" +checksum = "00810f1d8b74be64b13dbf3db89ac67740615d6c891f0e7b6179326533011a07" [[package]] name = "js-sys" @@ -1559,19 +1559,19 @@ checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" [[package]] name = "libloading" -version = "0.8.6" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34" +checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-link", ] [[package]] name = "libredox" -version = "0.1.3" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" dependencies = [ "bitflags 2.10.0", "libc", @@ -1757,9 +1757,9 @@ dependencies = [ [[package]] name = "num-conv" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" [[package]] name = "num-traits" @@ -1856,7 +1856,6 @@ dependencies = [ "env_logger", "futures-util", "infer", - "lazy_static", "mime", "mime_guess", "num_cpus", @@ -1976,15 +1975,15 @@ checksum = "60f6ce597ecdcc9a098e7fddacb1065093a3d66446fa16c675e7e71d1b5c28e6" [[package]] name = "portable-atomic" -version = "1.13.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "portable-atomic-util" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +checksum = "7a9db96d7fa8782dd8c15ce32ffe8680bbd1e978a43bf51a34d39483540495f5" dependencies = [ "portable-atomic", ] @@ -2012,9 +2011,9 @@ checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "ppmd-rust" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d558c559f0450f16f2a27a1f017ef38468c1090c9ce63c8e51366232d53717b4" +checksum = "efca4c95a19a79d1c98f791f10aebd5c1363b473244630bb7dbde1dc98455a24" [[package]] name = "ppv-lite86" @@ -2120,9 +2119,9 @@ dependencies = [ [[package]] name = "rangemap" -version = "1.5.1" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684" +checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" [[package]] name = "rayon" @@ -2166,9 +2165,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.12.2" +version = "1.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843bc0191f75f3e22651ae5f1e72939ab2f72a4bc30fa80a066bd66edefc24d4" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" dependencies = [ "aho-corasick", "memchr", @@ -2178,9 +2177,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -2189,15 +2188,15 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.6" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +checksum = "a96887878f22d7bad8a3b6dc5b7440e0ada9a245242924394987b21cf2210a4c" [[package]] name = "rust-embed" @@ -2272,7 +2271,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.11.0", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -2427,9 +2426,9 @@ checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" [[package]] name = "slab" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "smallvec" @@ -2529,7 +2528,7 @@ dependencies = [ "getrandom 0.3.4", "once_cell", "rustix 1.1.3", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -2619,9 +2618,9 @@ dependencies = [ [[package]] name = "time" -version = "0.3.45" +version = "0.3.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd" +checksum = "9da98b7d9b7dad93488a84b8248efc35352b0b2657397d4167e7ad67e5d535e5" dependencies = [ "deranged", "itoa", @@ -2635,15 +2634,15 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b36ee98fd31ec7426d599183e8fe26932a8dc1fb76ddb6214d05493377d34ca" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" [[package]] name = "time-macros" -version = "0.2.25" +version = "0.2.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71e552d1249bf61ac2a52db88179fd0673def1e1ad8243a00d9ec9ed71fee3dd" +checksum = "78cc610bac2dcee56805c99642447d4c5dbde4d01f752ffea0199aee1f601dc4" dependencies = [ "num-conv", "time-core", @@ -2762,9 +2761,9 @@ dependencies = [ [[package]] name = "typed-path" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e43ffa54726cdc9ea78392023ffe9fe9cf9ac779e1c6fcb0d23f9862e3879d20" +checksum = "3015e6ce46d5ad8751e4a772543a30c7511468070e98e64e20165f8f81155b64" [[package]] name = "typenum" @@ -2774,9 +2773,9 @@ checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" [[package]] name = "unicase" -version = "2.8.1" +version = "2.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75b844d17643ee918803943289730bec8aac480150456169e647ed0b576ba539" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" [[package]] name = "unicode-bidi" @@ -2843,9 +2842,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.19.0" +version = "1.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2e054861b4bd027cd373e18e8d8d8e6548085000e41290d95ce0c373a654b4a" +checksum = "ee48d38b119b0cd71fe4141b30f5ba9c7c5d9f4e7a3a8b4a674e4b6ef789976f" dependencies = [ "js-sys", "wasm-bindgen", @@ -2983,7 +2982,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -3177,9 +3176,9 @@ checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" [[package]] name = "xml-rs" -version = "0.8.25" +version = "0.8.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5b940ebc25896e71dd073bad2dbaa2abfe97b0a391415e22ad1326d9c54e3c4" +checksum = "3ae8337f8a065cfc972643663ea4279e04e7256de865aa66fe25cec5fb912d3f" [[package]] name = "yoke" @@ -3206,18 +3205,18 @@ dependencies = [ [[package]] name = "zerocopy" -version = "0.8.33" +version = "0.8.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668f5168d10b9ee831de31933dc111a459c97ec93225beb307aed970d1372dfd" +checksum = "57cf3aa6855b23711ee9852dfc97dfaa51c45feaba5b645d0c777414d494a961" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.33" +version = "0.8.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c7962b26b0a8685668b671ee4b54d007a67d4eaf05fda79ac0ecf41e32270f1" +checksum = "8a616990af1a287837c4fe6596ad77ef57948f787e46ce28e166facc0cc1cb75" dependencies = [ "proc-macro2", "quote", @@ -3354,15 +3353,15 @@ dependencies = [ [[package]] name = "zlib-rs" -version = "0.5.5" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" +checksum = "a7948af682ccbc3342b6e9420e8c51c1fe5d7bf7756002b4a3c6cabfe96a7e3c" [[package]] name = "zmij" -version = "1.0.16" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65" +checksum = "3ff05f8caa9038894637571ae6b9e29466c1f4f829d26c9b28f869a29cbe3445" [[package]] name = "zopfli" diff --git a/Cargo.toml b/Cargo.toml index d1e539e..73e9fd9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,10 +15,9 @@ categories = ["text-processing", "parsing"] calamine = "0.32.0" docx-rs = "0.4.18" infer = "0.19.0" -lazy_static = "1.5.0" mime = "0.3.17" pdf-extract = "0.10.0" -regex = "1.12.2" +regex = "1.12.3" tempfile = "3.24.0" tesseract = "0.15.2" zip = "7.2.0" @@ -38,6 +37,9 @@ env_logger = "0.11.8" criterion = "0.8" num_cpus = "1.17.0" +[lints.clippy] +pedantic = "warn" + [[bench]] name = "function_parse" harness = false diff --git a/benches/function_parse.rs b/benches/function_parse.rs index a5eabac..88289f0 100644 --- a/benches/function_parse.rs +++ b/benches/function_parse.rs @@ -56,7 +56,7 @@ fn benchmark_sequential_vs_parallel(c: &mut Criterion) { .par_iter() .map(|d| parse(black_box(d))) .collect::, ParserError>>() - }) + }); }); // Benchmark sequential parsing @@ -66,7 +66,7 @@ fn benchmark_sequential_vs_parallel(c: &mut Criterion) { .iter() .map(|d| parse(black_box(d))) .collect::, ParserError>>() - }) + }); }); group.finish(); @@ -98,7 +98,7 @@ fn benchmark_parallel_efficiency(c: &mut Criterion) { .par_iter() .map(|d| parse(black_box(d))) .collect::, ParserError>>() - }) + }); }); } @@ -123,7 +123,7 @@ fn benchmark_per_filetype(c: &mut Criterion) { .par_iter() .map(|d| parse(black_box(d))) .collect::, ParserError>>() - }) + }); }); } @@ -144,7 +144,7 @@ fn benchmark_per_filetype(c: &mut Criterion) { .par_iter() .map(|d| parse(black_box(d))) .collect::, ParserError>>() - }) + }); }); } @@ -153,6 +153,7 @@ fn benchmark_per_filetype(c: &mut Criterion) { // Finds the threshold number of files for each type that takes less than 16ms fn benchmark_parallel_threshold(c: &mut Criterion) { + const SAMPLE_COUNT: usize = 5; let max_time_threshold = Duration::from_millis(16); // Read each test file only once @@ -181,7 +182,6 @@ fn benchmark_parallel_threshold(c: &mut Criterion) { } // Take multiple measurements and use median for robustness - const SAMPLE_COUNT: usize = 5; let mut durations = Vec::with_capacity(SAMPLE_COUNT); for _ in 0..SAMPLE_COUNT { @@ -227,13 +227,16 @@ fn benchmark_parallel_threshold(c: &mut Criterion) { // The threshold count is now in 'low' let threshold_count = low; - // Define percentages to test around the threshold - let percentages = [99.0, 99.9, 100.0, 100.1, 101.0]; + // Permille values for percentages: 99.0%, 99.9%, 100.0%, 100.1%, 101.0% + let permille_values: [usize; 5] = [990, 999, 1000, 1001, 1010]; - // Generate test points based on percentages of the threshold - let mut test_points: Vec = percentages + // Generate test points based on percentages of the threshold using integer math + let mut test_points: Vec = permille_values .iter() - .map(|&p| ((threshold_count as f64 * p / 100.0).ceil() as usize).max(1)) + .map(|&p| { + let product = threshold_count.saturating_mul(p); + product.div_ceil(1000).max(1) + }) .collect(); test_points.dedup(); @@ -251,7 +254,7 @@ fn benchmark_parallel_threshold(c: &mut Criterion) { .par_iter() .map(|d| parse(black_box(d))) .collect::, ParserError>>() - }) + }); }); } diff --git a/src/core/constants.rs b/src/core/constants.rs index 4e23e8b..2d30b79 100644 --- a/src/core/constants.rs +++ b/src/core/constants.rs @@ -13,6 +13,6 @@ pub const APPLICATION_DOCX: &str = pub const APPLICATION_XLSX: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; -/// MIME type for PPTX (Microsoft PowerPoint) presentations +/// MIME type for PPTX (Microsoft `PowerPoint`) presentations pub const APPLICATION_PPTX: &str = "application/vnd.openxmlformats-officedocument.presentationml.presentation"; diff --git a/src/core/errors.rs b/src/core/errors.rs index 33d2c01..13c3602 100644 --- a/src/core/errors.rs +++ b/src/core/errors.rs @@ -31,14 +31,14 @@ pub enum ParserError { impl std::fmt::Display for ParserError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - ParserError::IoError(msg) => write!(f, "IO error: {}", msg), - ParserError::ParseError(msg) => write!(f, "Parse error: {}", msg), - ParserError::InvalidFormat(msg) => write!(f, "Invalid format: {}", msg), + ParserError::IoError(msg) => write!(f, "IO error: {msg}"), + ParserError::ParseError(msg) => write!(f, "Parse error: {msg}"), + ParserError::InvalidFormat(msg) => write!(f, "Invalid format: {msg}"), } } } -/// Implements the std::error::Error trait for ParserError to allow it to be used +/// Implements the `std::error::Error` trait for `ParserError` to allow it to be used /// with the ? operator and to be boxed as a dyn Error. impl std::error::Error for ParserError {} diff --git a/src/core/parsers.rs b/src/core/parsers.rs index be53e15..3b5c675 100644 --- a/src/core/parsers.rs +++ b/src/core/parsers.rs @@ -21,14 +21,12 @@ use super::{ errors::ParserError, }; use infer::Infer; -use lazy_static::lazy_static; use mime::{IMAGE, Mime, TEXT, TEXT_PLAIN}; use std::str; +use std::sync::LazyLock; // Create a static infer instance to avoid recreating it on every call -lazy_static! { - static ref INFER: Infer = Infer::new(); -} +static INFER: LazyLock = LazyLock::new(Infer::new); /// Parses the given data into plain text. /// @@ -74,6 +72,11 @@ lazy_static! { /// // Verify the result /// assert_eq!(result, "Hello, world! This is a sample text file."); /// ``` +/// +/// # Errors +/// +/// Returns [`ParserError::InvalidFormat`] if the file type is unsupported or unrecognized. +/// May return other [`ParserError`] variants if an error occurs during parsing. pub fn parse(data: &[u8]) -> Result { match determine_mime_type(data) { Some(mime) if mime == APPLICATION_PDF => parse_pdf(data), @@ -83,8 +86,7 @@ pub fn parse(data: &[u8]) -> Result { Some(mime) if mime.type_() == TEXT => parse_text(data), Some(mime) if mime.type_() == IMAGE => parse_image(data), Some(mime) => Err(ParserError::InvalidFormat(format!( - "Unsupported file type: {}", - mime + "Unsupported file type: {mime}" ))), None => Err(ParserError::InvalidFormat( "Could not determine file type.".to_string(), diff --git a/src/core/parsers/docx.rs b/src/core/parsers/docx.rs index a8032d6..08c8000 100644 --- a/src/core/parsers/docx.rs +++ b/src/core/parsers/docx.rs @@ -1,7 +1,7 @@ //! DOCX parser module. //! //! This module provides functionality for extracting text from Microsoft Word DOCX -//! documents using the docx_rs library. +//! documents using the `docx_rs` library. use super::super::errors::ParserError; use docx_rs::read_docx; @@ -22,7 +22,7 @@ use docx_rs::read_docx; /// /// # Implementation Notes /// -/// * Uses the docx_rs library for DOCX parsing +/// * Uses the `docx_rs` library for DOCX parsing /// * Extracts text by traversing document structure: documents → paragraphs → runs → text /// * Joins paragraphs with newlines and trims whitespace from the result /// * TODO: Consider simplifying the document traversal logic diff --git a/src/core/parsers/image.rs b/src/core/parsers/image.rs index 71c4348..00fcee0 100644 --- a/src/core/parsers/image.rs +++ b/src/core/parsers/image.rs @@ -5,7 +5,7 @@ //! various image formats including PNG, JPEG, and WebP. use super::super::errors::ParserError; -use lazy_static::lazy_static; +use std::sync::LazyLock; use std::{fs, io::Write}; use tempfile::{NamedTempFile, TempDir}; use tesseract::Tesseract; @@ -20,20 +20,18 @@ const TESSDATA_FRA: &[u8] = include_bytes!(concat!( "/assets/ocr/fra.traineddata" )); -lazy_static! { - static ref TESSDATA_DIR: TempDir = { - let dir = tempfile::tempdir().expect("Failed to create tessdata directory"); - let dir_path = dir.path(); +static TESSDATA_DIR: LazyLock = LazyLock::new(|| { + let dir = tempfile::tempdir().expect("Failed to create tessdata directory"); + let dir_path = dir.path(); - // Write language files to tessdata directory (only done once) - fs::write(dir_path.join("eng.traineddata"), TESSDATA_ENG) - .expect("Failed to write English training data"); - fs::write(dir_path.join("fra.traineddata"), TESSDATA_FRA) - .expect("Failed to write French training data"); + // Write language files to tessdata directory (only done once) + fs::write(dir_path.join("eng.traineddata"), TESSDATA_ENG) + .expect("Failed to write English training data"); + fs::write(dir_path.join("fra.traineddata"), TESSDATA_FRA) + .expect("Failed to write French training data"); - dir - }; -} + dir +}); /// Parses image data and extracts text using OCR. /// diff --git a/src/core/parsers/pdf.rs b/src/core/parsers/pdf.rs index 8c7bf75..cf07540 100644 --- a/src/core/parsers/pdf.rs +++ b/src/core/parsers/pdf.rs @@ -1,7 +1,7 @@ //! PDF parser module. //! //! This module provides functionality for extracting text from PDF documents using -//! the pdf_extract library. +//! the `pdf_extract` library. use super::super::errors::ParserError; use pdf_extract::extract_text_from_mem; @@ -22,7 +22,7 @@ use pdf_extract::extract_text_from_mem; /// /// # Implementation Notes /// -/// * Uses the pdf_extract library for PDF text extraction +/// * Uses the `pdf_extract` library for PDF text extraction /// * Trims whitespace from the result before returning /// * TODO: Need to find a way to silence the output of that function since on /// unknown characters it outputs a lot of errors, cluttering the logs. diff --git a/src/core/parsers/pptx.rs b/src/core/parsers/pptx.rs index 622b0b2..6a26447 100644 --- a/src/core/parsers/pptx.rs +++ b/src/core/parsers/pptx.rs @@ -1,6 +1,6 @@ //! PPTX parser module. //! -//! This module provides functionality for extracting text from Microsoft PowerPoint +//! This module provides functionality for extracting text from Microsoft `PowerPoint` //! PPTX presentation files. It uses the zip crate to extract slide XML files and //! regex to extract text content. @@ -46,7 +46,10 @@ pub(crate) fn parse_pptx(data: &[u8]) -> Result { let mut file = archive.by_index(i)?; // Only process slide XML files - if file.name().starts_with("ppt/slides/slide") && file.name().ends_with(".xml") { + let is_xml = std::path::Path::new(file.name()) + .extension() + .is_some_and(|ext| ext.eq_ignore_ascii_case("xml")); + if file.name().starts_with("ppt/slides/slide") && is_xml { slide_count += 1; if slide_count > 1 { text.push_str("\n--- Slide "); diff --git a/src/core/parsers/xlsx.rs b/src/core/parsers/xlsx.rs index 93b4c31..1dbb05d 100644 --- a/src/core/parsers/xlsx.rs +++ b/src/core/parsers/xlsx.rs @@ -30,7 +30,7 @@ use std::io::Cursor; /// * Adds sheet headers for multi-sheet workbooks /// * Memory-efficient implementation using cursors instead of temporary files /// * TODO: Need proper logic to escape commas and quotes -/// * TODO: Consider using the csv crate to convert each sheet and pass it through the parse_text function +/// * TODO: Consider using the csv crate to convert each sheet and pass it through the `parse_text` function pub(crate) fn parse_xlsx(data: &[u8]) -> Result { // Create a cursor from the bytes for memory-based reading let cursor = Cursor::new(data); @@ -42,7 +42,7 @@ pub(crate) fn parse_xlsx(data: &[u8]) -> Result { let mut csv_data = String::new(); // Copy the sheet names to avoid borrowing issues - let sheet_names = excel.sheet_names().to_vec(); + let sheet_names = excel.sheet_names().clone(); for name in sheet_names { if let Ok(range) = excel.worksheet_range(&name) { @@ -55,7 +55,7 @@ pub(crate) fn parse_xlsx(data: &[u8]) -> Result { .rows() .map(|row| { row.iter() - .map(|cell| cell.to_string()) + .map(std::string::ToString::to_string) .collect::>() .join(",") }) diff --git a/src/web/errors.rs b/src/web/errors.rs index b6ea0b7..72ab1d8 100644 --- a/src/web/errors.rs +++ b/src/web/errors.rs @@ -26,9 +26,9 @@ pub enum ApiError { impl std::fmt::Display for ApiError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - ApiError::BadRequest(msg) => write!(f, "Bad Request: {}", msg), - ApiError::InternalError(msg) => write!(f, "Internal Error: {}", msg), - ApiError::ConfigError(msg) => write!(f, "Configuration Error: {}", msg), + ApiError::BadRequest(msg) => write!(f, "Bad Request: {msg}"), + ApiError::InternalError(msg) => write!(f, "Internal Error: {msg}"), + ApiError::ConfigError(msg) => write!(f, "Configuration Error: {msg}"), } } } @@ -47,16 +47,18 @@ impl ResponseError for ApiError { match self { ApiError::BadRequest(_) => HttpResponse::BadRequest().json(error_response), - ApiError::InternalError(_) => HttpResponse::InternalServerError().json(error_response), - ApiError::ConfigError(_) => HttpResponse::InternalServerError().json(error_response), + ApiError::InternalError(_) | ApiError::ConfigError(_) => { + HttpResponse::InternalServerError().json(error_response) + } } } fn status_code(&self) -> StatusCode { match self { ApiError::BadRequest(_) => StatusCode::BAD_REQUEST, - ApiError::InternalError(_) => StatusCode::INTERNAL_SERVER_ERROR, - ApiError::ConfigError(_) => StatusCode::INTERNAL_SERVER_ERROR, + ApiError::InternalError(_) | ApiError::ConfigError(_) => { + StatusCode::INTERNAL_SERVER_ERROR + } } } } diff --git a/tests/endpoints.rs b/tests/endpoints.rs index 5bb671d..4edef89 100644 --- a/tests/endpoints.rs +++ b/tests/endpoints.rs @@ -22,6 +22,6 @@ fn test_file_paths_exist() { let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")) .join("tests/assets") .join(name); - assert!(path.exists(), "Test file should exist: {:?}", path); + assert!(path.exists(), "Test file should exist: {path:?}"); } }