1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
use super::{LineNumberCache, Position, PositionIterator};
use failure::Fail;
use std::ops::Deref;

/// A borrowed byte buffer paired with a line-number lookup cache.
///
/// Values are created through [`AsciiFile::new`], which rejects any input
/// containing a non-ASCII byte and records the offset of every `\n`.
#[derive(Debug)]
pub struct AsciiFile<'m> {
    // TODO: mapping should be private
    /// The raw file contents, borrowed for the lifetime `'m`.
    pub mapping: &'m [u8],
    /// Lookup structure built from the byte offsets of the newlines in
    /// `mapping`.
    line_cache: LineNumberCache,
}

/// Error returned by [`AsciiFile::new`] when the input is not pure ASCII.
#[derive(Debug, Fail)]
pub enum EncodingError {
    /// A non-ASCII byte was encountered.
    ///
    /// `offset` is the zero-based byte index of the first offending byte.
    /// `position` is the same location rendered as `row:col`, where each
    /// component is the value reported by the line cache plus one.
    #[fail(display = "input contains non-ascii character at {}", position)]
    NotAscii { offset: usize, position: String },
}

impl<'m> AsciiFile<'m> {
    /// Borrows the line-number cache that was built when this file was
    /// constructed.
    pub fn lookup_cache(&self) -> &LineNumberCache {
        &self.line_cache
    }

    /// Validates that `mapping` is pure ASCII and indexes its newlines.
    ///
    /// Cost is linear in the length of `mapping`: every byte is inspected
    /// once, and the scan stops at the first non-ASCII byte.
    ///
    /// # Errors
    ///
    /// Returns [`EncodingError::NotAscii`] carrying the byte offset of the
    /// first non-ASCII byte together with its `row:col` rendering (both
    /// components are the line cache's result plus one).
    pub fn new(mapping: &'m [u8]) -> Result<Self, EncodingError> {
        // TODO: move ascii validation out of constructor
        // TODO: the newline lookup is computed here to ensure the whole
        // previous file was traversed when a position is read. But as
        // positions can only be generated by traversing the linked list,
        // this invariant is always met, even when the info is not computed
        // here.
        let mut newline_offsets = Vec::new();

        for (offset, &byte) in mapping.iter().enumerate() {
            match byte {
                b'\n' => newline_offsets.push(offset),
                ascii if ascii.is_ascii() => {}
                _ => {
                    // The newlines collected so far are exactly the ones
                    // preceding the bad byte, which is all that is needed
                    // to locate it.
                    let partial_cache = LineNumberCache::new(newline_offsets);
                    let (row, col) = partial_cache.row_and_column(offset);
                    let position = format!("{}:{}", row + 1, col + 1);
                    return Err(EncodingError::NotAscii { offset, position });
                }
            }
        }

        newline_offsets.shrink_to_fit();
        let line_cache = LineNumberCache::new(newline_offsets);

        Ok(Self {
            mapping,
            line_cache,
        })
    }

    /// Creates a [`PositionIterator`] seeded with the position that
    /// `Position::at_file_start` yields for this file.
    pub fn iter(&self) -> PositionIterator<'_> {
        let start = Position::at_file_start(self);
        PositionIterator::new(start)
    }
}

/// Lets an [`AsciiFile`] be used wherever a `&str` is expected.
impl<'m> Deref for AsciiFile<'m> {
    type Target = str;

    fn deref(&self) -> &str {
        let bytes: &[u8] = self.mapping;
        // SAFETY: `AsciiFile::new` only succeeds when every byte is ASCII,
        // and ASCII is always valid UTF-8.
        // NOTE(review): `mapping` is a `pub` field, so this relies on no
        // caller swapping in non-ASCII data after construction — confirm.
        unsafe { std::str::from_utf8_unchecked(bytes) }
    }
}

/// Converts a borrowed [`AsciiFile`] into a string slice that lives as long
/// as the underlying mapping (`'m`), not just as long as the borrow (`'a`).
///
/// NOTE(review): written as `Into` rather than `From`; presumably this
/// predates the relaxed orphan rules — confirm the minimum supported
/// compiler before converting it.
impl<'m, 'a> Into<&'m str> for &'a AsciiFile<'m> {
    fn into(self) -> &'m str {
        // Copy the inner reference out so the result is tied to `'m`.
        let bytes: &'m [u8] = self.mapping;
        // SAFETY: `AsciiFile::new` only succeeds when every byte is ASCII,
        // and ASCII is always valid UTF-8.
        // NOTE(review): `mapping` is a `pub` field, so this relies on no
        // caller swapping in non-ASCII data after construction — confirm.
        unsafe { std::str::from_utf8_unchecked(bytes) }
    }
}

#[cfg(test)]
// `println!("{:?}", ..)` is used below to show the error in test output.
#[allow(clippy::print_stdout, clippy::use_debug)]
mod tests {
    use super::*;

    /// Pure-ASCII input is accepted and dereferences back to the same text.
    #[test]
    fn works_with_ascii() {
        let string = "ABCDEFG\n\t";
        let file = AsciiFile::new(string.as_bytes()).unwrap();
        let contents: &str = &file;
        assert_eq!(string, contents);
    }

    /// Valid UTF-8 that is not ASCII is rejected, and the error points at
    /// the first non-ASCII byte (offset 3, i.e. row 1, column 4).
    #[test]
    fn returns_err_on_non_ascii() {
        let input = "one💩two";
        let file = AsciiFile::new(input.as_bytes());
        assert!(file.is_err());
        let e = file.err().unwrap();
        println!("{:?}", e);
        let EncodingError::NotAscii { offset, position } = e;
        assert_eq!(offset, 3);
        assert_eq!(position, "1:4");
    }

    /// Input that is neither ASCII nor valid UTF-8 is rejected as well.
    #[test]
    fn returns_err_on_non_ascii_non_utf8() {
        // Serialize the UTF-16 code units explicitly as little-endian bytes.
        // Reinterpreting the `u16` buffer in place would make the byte order
        // (and therefore the expected offset) depend on the host's
        // endianness, and would need `unsafe`.
        let mut input: Vec<u8> = Vec::new();
        for unit in "ä".encode_utf16() {
            input.extend_from_slice(&unit.to_le_bytes());
        }

        // check if we generated the invalid string correctly
        assert_eq!(input, [0xE4, 0x00]);
        assert!(std::str::from_utf8(&input).is_err());

        // check if it is rejected
        let file = AsciiFile::new(&input);
        assert!(file.is_err());

        let EncodingError::NotAscii { offset, .. } = file.err().unwrap();
        assert_eq!(offset, 0);
    }
}