1use std::ffi::OsStr;
13use std::fs::File;
14use std::io;
15use std::io::{BufRead, BufReader, Read, Seek, SeekFrom};
16use std::path::Path;
17
18use super::errors::DictError;
19use byteorder::*;
20
/// Largest definition length (in bytes, 1 MiB) that the readers in this module are willing to
/// buffer; longer requests are rejected with `DictError::MemoryError`.
pub static MAX_BYTES_FOR_BUFFER: u64 = 1 << 20;

/// gzip header flag bit signalling that an extra field (FLG.FEXTRA) is present.
pub static GZ_FEXTRA: u8 = 1 << 2;

/// gzip header flag bit signalling that a NUL-terminated file name follows the extra field.
pub static GZ_FNAME: u8 = 1 << 3;

/// gzip header flag bit signalling that a NUL-terminated comment is present in the header.
pub static GZ_COMMENT: u8 = 1 << 4;

/// gzip header flag bit signalling that a two-byte header checksum is present.
pub static GZ_FHCRC: u8 = 1 << 1;
33
/// Common interface of all dictionary data readers in this module.
///
/// Implementors give access to the text of a dictionary, addressed by byte offset and byte
/// length within the (uncompressed) dictionary data.
pub trait DictReader {
    /// Fetches the definition that starts at byte `start_offset` and is `length` bytes long.
    ///
    /// Returns the bytes decoded as UTF-8, or a `DictError` if the definition could not be
    /// read.
    fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError>;
}
44
/// Reader for uncompressed dictionary data, backed by any seekable byte source.
pub struct DictReaderRaw<B: Read + Seek> {
    /// The underlying source the definitions are read from.
    dict_data: B,
    /// Total length of `dict_data` in bytes, used to reject out-of-range requests.
    total_length: u64,
}
52
53impl<B: Read + Seek> DictReaderRaw<B> {
54 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
56 pub fn new(mut dict_data: B) -> Result<DictReaderRaw<B>, DictError> {
57 let end = dict_data.seek(SeekFrom::End(0))?;
58 Ok(DictReaderRaw {
59 dict_data,
60 total_length: end,
61 })
62 }
63}
64
65impl<B: Read + Seek> DictReader for DictReaderRaw<B> {
66 #[cfg_attr(
68 feature = "tracing",
69 tracing::instrument(skip(self), fields(start_offset, length))
70 )]
71 fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError> {
72 if length > MAX_BYTES_FOR_BUFFER {
73 return Err(DictError::MemoryError);
74 }
75
76 if (start_offset + length) > self.total_length {
77 return Err(DictError::IoError(io::Error::new(
78 io::ErrorKind::UnexpectedEof,
79 "a \
80 seek beyond the end of uncompressed data was requested",
81 )));
82 }
83
84 self.dict_data.seek(SeekFrom::Start(start_offset))?;
85 let mut read_data = vec![0; length as usize];
86 let bytes_read = self.dict_data.read(read_data.as_mut_slice())? as u64;
87 if bytes_read != length {
88 return Err(DictError::IoError(io::Error::new(
90 io::ErrorKind::UnexpectedEof,
91 "seek beyond end of file",
92 )));
93 }
94 Ok(String::from_utf8(read_data)?)
95 }
96}
97
98#[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
109pub fn load_dict<P: AsRef<Path>>(path: P) -> Result<Box<dyn DictReader>, DictError> {
110 if path.as_ref().extension() == Some(OsStr::new("dz")) {
111 let reader = File::open(path)?;
112 Ok(Box::new(DictReaderDz::new(reader)?))
113 } else {
114 let reader = BufReader::new(File::open(path)?);
115 Ok(Box::new(DictReaderRaw::new(reader)?))
116 }
117}
118
/// Reader for dictzip-compressed (`.dz`) dictionary data.
///
/// The compressed data is split into chunks; the offsets recorded here allow seeking to and
/// decompressing only the chunks that contain a requested definition.
pub struct DictReaderDz<B: Read + Seek> {
    /// The underlying compressed source.
    dzdict: B,
    /// Chunk length taken from the dictzip header; used to map an uncompressed offset to a
    /// chunk index and as the size of the buffer a chunk is inflated into.
    uchunk_length: usize,
    /// Offset in `dzdict` just past the last compressed chunk.
    end_compressed_data: usize,
    /// Offset in `dzdict` at which each compressed chunk starts, in chunk order.
    chunk_offsets: Vec<usize>,
    /// Length of the uncompressed data as determined by `new`; requests reaching beyond it
    /// are rejected.
    ufile_length: u64,
}
136
/// Location of one compressed chunk within the dictzip file.
#[derive(Debug)]
struct Chunk {
    /// Offset of the chunk's first byte in the compressed file.
    offset: usize,
    /// Number of compressed bytes the chunk occupies.
    length: usize,
}
143
144impl<B: Read + Seek> DictReaderDz<B> {
145 #[cfg_attr(feature = "tracing", tracing::instrument(skip_all))]
147 pub fn new(dzdict: B) -> Result<DictReaderDz<B>, DictError> {
148 let mut buffered_dzdict = BufReader::new(dzdict);
149 let mut header = vec![0u8; 12];
150 buffered_dzdict.read_exact(&mut header)?;
151 if header[0..2] != [0x1F, 0x8B] {
152 return Err(DictError::InvalidFileFormat(
153 "Not in gzip format".into(),
154 None,
155 ));
156 }
157
158 let flags = &header[3]; if (flags & GZ_FEXTRA) == 0 {
160 return Err(DictError::InvalidFileFormat(
162 "Extra flag (FLG.FEXTRA) \
163 not set, not in gzip + dzip format"
164 .into(),
165 None,
166 ));
167 }
168
169 let xlen = LittleEndian::read_u16(&header[10..12]);
171
172 let mut fextra = vec![0u8; xlen as usize];
174 buffered_dzdict.read_exact(&mut fextra)?;
175
176 if fextra[0..2] != [b'R', b'A'] {
177 return Err(DictError::InvalidFileFormat(
178 "No dictzip info found in FEXTRA \
179 header (behind XLEN, in SI1SI2 fields)"
180 .into(),
181 None,
182 ));
183 }
184
185 let length_subfield = LittleEndian::read_u16(&fextra[2..4]);
186 assert_eq!(
187 length_subfield,
188 xlen - 4,
189 "the length of the subfield \
190 should be the same as the fextra field, ignoring the \
191 additional length information and the file format identification"
192 );
193 let subf_version = LittleEndian::read_u16(&fextra[4..6]);
194 if subf_version != 1 {
195 return Err(DictError::InvalidFileFormat(
196 "Unimplemented dictzip \
197 version, only ver 1 supported"
198 .into(),
199 None,
200 ));
201 }
202
203 let uchunk_length = LittleEndian::read_u16(&fextra[6..8]);
206 let chunk_count = LittleEndian::read_u16(&fextra[8..10]);
208 if chunk_count == 0 {
209 return Err(DictError::InvalidFileFormat(
210 "No compressed chunks in \
211 file or broken header information"
212 .into(),
213 None,
214 ));
215 }
216
217 let numbers_chunks_which_would_fit = ((fextra.len() - 10) / 2) as u16; if numbers_chunks_which_would_fit != chunk_count {
223 return Err(DictError::InvalidFileFormat(
224 format!(
225 "Expected {} chunks \
226 according to dictzip header, but the FEXTRA field can \
227 accomodate {}; possibly broken file",
228 chunk_count, numbers_chunks_which_would_fit
229 ),
230 None,
231 ));
232 }
233
234 if (flags & GZ_FNAME) != 0 {
236 let mut tmp = Vec::new();
237 buffered_dzdict.read_until(b'\0', &mut tmp)?;
238 }
239
240 if (flags & GZ_COMMENT) != 0 {
242 let mut tmp = Vec::new();
243 buffered_dzdict.read_until(b'\0', &mut tmp)?;
244 }
245
246 if (flags & GZ_FHCRC) != 0 {
248 buffered_dzdict.seek(SeekFrom::Current(2))?;
249 }
250
251 let mut chunk_offsets = Vec::with_capacity(chunk_count as usize);
253 let mut end_compressed_data = buffered_dzdict.seek(SeekFrom::Current(0))? as usize;
255 let chunks_from_header = &fextra[10usize..(10 + chunk_count * 2) as usize];
257
258 for index in (0..chunks_from_header.len()).filter(|i| (i % 2) == 0) {
260 let index = index as usize;
261 let compressed_len =
262 LittleEndian::read_u16(&chunks_from_header[index..(index + 2)]) as usize;
263 chunk_offsets.push(end_compressed_data);
264 end_compressed_data += compressed_len;
265 }
266 assert_eq!(
267 chunk_offsets.len() as u16,
268 chunk_count,
269 "The read number of compressed chunks in \
270 the .dz file must be equivalent to the number of chunks actually found in the file.\n"
271 );
272
273 buffered_dzdict.seek(SeekFrom::Start(end_compressed_data as u64))?;
275 let uncompressed = buffered_dzdict.read_i32::<LittleEndian>()?;
276
277 Ok(DictReaderDz {
278 dzdict: buffered_dzdict.into_inner(),
279 chunk_offsets,
280 end_compressed_data,
281 uchunk_length: uchunk_length as usize,
282 ufile_length: uncompressed as u64,
283 })
284 }
285
286 #[cfg_attr(
287 feature = "tracing",
288 tracing::instrument(skip(self), fields(start_offset, length))
289 )]
290 fn get_chunks_for(&self, start_offset: u64, length: u64) -> Vec<Chunk> {
291 let mut chunks = Vec::new();
292 let start_chunk = start_offset as usize / self.uchunk_length;
293 let end_chunk = (start_offset + length) as usize / self.uchunk_length;
294 for id in start_chunk..=end_chunk {
295 let chunk_length = match self.chunk_offsets.get(id + 1) {
296 Some(next) => next - self.chunk_offsets[id],
297 None => self.end_compressed_data - self.chunk_offsets[id],
298 };
299 chunks.push(Chunk {
300 offset: self.chunk_offsets[id],
301 length: chunk_length,
302 });
303 }
304
305 chunks
306 }
307
308 #[cfg_attr(feature = "tracing", tracing::instrument(skip(self), fields(data_len = data.len())))]
310 fn inflate(&self, data: Vec<u8>) -> Result<Vec<u8>, DictError> {
311 let mut decoder = flate2::Decompress::new(false);
312 let mut decoded = vec![0u8; self.uchunk_length];
313 decoder.decompress(
314 data.as_slice(),
315 decoded.as_mut_slice(),
316 flate2::FlushDecompress::None,
317 )?;
318 Ok(decoded)
319 }
320}
321
322impl<B: Read + Seek> DictReader for DictReaderDz<B> {
323 #[cfg_attr(
325 feature = "tracing",
326 tracing::instrument(skip(self), fields(start_offset, length))
327 )]
328 fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result<String, DictError> {
329 if length > MAX_BYTES_FOR_BUFFER {
330 return Err(DictError::MemoryError);
331 }
332 if (start_offset + length) > self.ufile_length {
333 return Err(DictError::IoError(io::Error::new(
334 io::ErrorKind::UnexpectedEof,
335 "a \
336 seek beyond the end of uncompressed data was requested",
337 )));
338 }
339 let mut data = Vec::new();
340 for chunk in self.get_chunks_for(start_offset, length) {
341 let pos = self.dzdict.seek(SeekFrom::Start(chunk.offset as u64))?;
342 if pos != (chunk.offset as u64) {
343 return Err(DictError::IoError(io::Error::new(
344 io::ErrorKind::Other,
345 format!(
346 "attempted to seek to {} but new position is {}",
347 chunk.offset, pos
348 ),
349 )));
350 }
351 let mut definition = vec![0u8; chunk.length];
352 self.dzdict.read_exact(&mut definition)?;
353 data.push(self.inflate(definition)?);
354 }
355
356 let cut_front = start_offset as usize % self.uchunk_length;
358 let data = match data.len() {
360 0 => panic!(),
361 1 => data[0][cut_front..cut_front + length as usize].to_vec(),
362 n => {
363 let mut tmp = data[0][cut_front..].to_vec();
364 for text in data.iter().skip(1).take(n - 2) {
366 tmp.extend_from_slice(text);
367 }
368 let remaining_bytes = (length as usize + cut_front) % self.uchunk_length;
370 tmp.extend_from_slice(&data[n - 1][..remaining_bytes]);
371 tmp
372 }
373 };
374 Ok(String::from_utf8(data)?)
375 }
376}