#![feature(try_from)]
#![warn(
clippy::print_stdout,
clippy::unimplemented,
clippy::doc_markdown,
clippy::items_after_statements,
clippy::match_same_arms,
clippy::similar_names,
clippy::single_match_else,
clippy::use_self,
clippy::use_debug
)]
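//! Lexer for MiniJava: turns the source file into a stream of tokens
//! (keywords, operators, identifiers, integer literals, comments and
//! whitespace), reporting lexical errors and warnings along the way.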
pub mod lextest;
#[macro_use]
extern crate derive_more;
use asciifile::{Position, PositionIterator, Span, Spanned};
use compiler_shared::context::Context;
use diagnostics::u8_to_printable_representation;
use failure::Fail;
use std::{convert::TryFrom, fmt, result::Result};
use strtab::*;
use utils::*;
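// Matches the next `$len` characters of `$input` against a list of operator
// spellings. On a hit the characters are consumed and an operator token is
// produced; otherwise `$default` is evaluated, which lets the invocations in
// `lex_operator` nest from longest to shortest lookahead.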
macro_rules! match_op {
($input:expr, $( ($token_string:expr, $token:expr) ),+: $len:expr, $default:expr) => {{
let span = $input.peek_at_most($len).unwrap();
#[allow(clippy::single_match_else)]
match span.as_str() {
$(
$token_string => match_op!($input, span, $len, $token),
)+
_ => $default,
}
}};
($input:expr, $span:ident, $len:expr, $right:expr) => {{
debug_assert!($len >= 1);
for _ in 0..$len { $input.next().unwrap(); }
Some(Ok(Token::new($span, TokenKind::Operator($right))))
}};
}
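// Convenience aliases: tokens and lexical errors are both `Spanned`, i.e.
// they carry the span of source text they originate from.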
pub type TokenResult<'f> = Result<Token<'f>, LexicalError<'f>>;
pub type Token<'f> = Spanned<'f, TokenKind<'f>>;
pub type LexicalError<'f> = Spanned<'f, ErrorKind>;
pub type IntLit<'f> = &'f str;
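/// The kinds of tokens produced by the lexer. Whitespace and comments are
/// emitted as tokens too, so callers that do not need them have to skip them
/// (as the tests below do).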
#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash, PartialOrd, Ord, Display)]
pub enum TokenKind<'f> {
#[display(fmt = "'{}'", _0)]
Keyword(Keyword),
#[display(fmt = "`{}`", _0)]
Operator(Operator),
#[display(fmt = "identifier `{}`", _0)]
Identifier(Symbol<'f>),
#[display(fmt = "integer literal `{}`", _0)]
IntegerLiteral(IntLit<'f>),
#[display(fmt = "a comment")]
Comment(&'f str),
#[display(fmt = "whitespace")]
Whitespace,
}
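/// Errors that can occur during lexing. Like tokens, errors are wrapped in
/// `Spanned` (see `LexicalError`) so they point at the offending input.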
#[derive(Debug, Fail)]
pub enum ErrorKind {
UnclosedComment,
UnexpectedCharacter(u8),
}
impl fmt::Display for ErrorKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match *self {
ErrorKind::UnclosedComment => write!(f, "unclosed comment"),
ErrorKind::UnexpectedCharacter(byte) => fmt_unexpected_character(f, byte),
}
}
}
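// Pretty-prints an unexpected byte, special-casing characters that would be
// confusing if printed verbatim (newline, quotes, other whitespace and
// control characters).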
fn fmt_unexpected_character(f: &mut fmt::Formatter<'_>, byte: u8) -> fmt::Result {
match byte as char {
'\n' => write!(f, "Unexpected newline"),
'\\' => write!(f, "Unexpected backslash"),
'\'' => write!(f, "Unexpected single quote"),
'"' => write!(f, "Unexpected double quote"),
chr if chr.is_whitespace() => write!(
f,
"Unexpected whitespace '{}'",
u8_to_printable_representation(byte)
),
chr if chr.is_control() => write!(
f,
"Unexpected control character '{}'",
u8_to_printable_representation(byte)
),
_ => write!(
f,
"Unexpected character '{}'",
u8_to_printable_representation(byte)
),
}
}
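/// Non-fatal findings reported through the diagnostics facility during lexing.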
#[derive(Debug, Fail)]
pub enum Warning {
#[fail(display = "confusing usage of comment separator inside a comment")]
CommentSeparatorInsideComment,
}
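/// All Java keywords. MiniJava only uses a subset of them, but the full set
/// is presumably reserved so identifiers cannot collide with Java keywords;
/// the lexer therefore recognizes every one of them.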
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
pub enum Keyword {
Abstract,
Assert,
Boolean,
Break,
Byte,
Case,
Catch,
Char,
Class,
Const,
Continue,
Default,
Double,
Do,
Else,
Enum,
Extends,
False,
Finally,
Final,
Float,
For,
Goto,
If,
Implements,
Import,
InstanceOf,
Interface,
Int,
Long,
Native,
New,
Null,
Package,
Private,
Protected,
Public,
Return,
Short,
Static,
StrictFp,
Super,
Switch,
Synchronized,
This,
Throws,
Throw,
Transient,
True,
Try,
Void,
Volatile,
While,
}
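// Generates both `Display` and `TryFrom<&str>` for `Keyword` from a single
// list of (variant, spelling) pairs, so the two conversions cannot drift
// apart.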
macro_rules! derive_keyword_display_and_tryfrom {
($( ( $keywordvariant:path = $minijavarep:expr ) ),+ ) => {
impl fmt::Display for Keyword {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use self::Keyword::*;
let mj_rep = match self {
$( $keywordvariant => $minijavarep ),* ,
};
write!(f, "{}", mj_rep)
}
}
impl TryFrom<&str> for Keyword {
type Error = ();
fn try_from(s: &str) -> Result<Keyword, Self::Error> {
use self::Keyword::*;
match s {
$( $minijavarep => Ok($keywordvariant) ),* ,
_ => Err(()),
}
}
}
};
}
derive_keyword_display_and_tryfrom! {
(Abstract = "abstract"),
(Assert = "assert"),
(Boolean = "boolean"),
(Break = "break"),
(Byte = "byte"),
(Case = "case"),
(Catch = "catch"),
(Char = "char"),
(Class = "class"),
(Const = "const"),
(Continue = "continue"),
(Default = "default"),
(Double = "double"),
(Do = "do"),
(Else = "else"),
(Enum = "enum"),
(Extends = "extends"),
(False = "false"),
(Finally = "finally"),
(Final = "final"),
(Float = "float"),
(For = "for"),
(Goto = "goto"),
(If = "if"),
(Implements = "implements"),
(Import = "import"),
(InstanceOf = "instanceof"),
(Interface = "interface"),
(Int = "int"),
(Long = "long"),
(Native = "native"),
(New = "new"),
(Null = "null"),
(Package = "package"),
(Private = "private"),
(Protected = "protected"),
(Public = "public"),
(Return = "return"),
(Short = "short"),
(Static = "static"),
(StrictFp = "strictfp"),
(Super = "super"),
(Switch = "switch"),
(Synchronized = "synchronized"),
(This = "this"),
(Throws = "throws"),
(Throw = "throw"),
(Transient = "transient"),
(True = "true"),
(Try = "try"),
(Void = "void"),
(Volatile = "volatile"),
(While = "while")
}
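/// All operators and separators recognized by the lexer, named after the
/// characters they consist of rather than their semantic meaning.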
#[derive(Debug, PartialEq, Eq, Clone, Hash, PartialOrd, Ord, Copy)]
pub enum Operator {
ExclaimEqual,
Exclaim,
LeftParen,
RightParen,
StarEqual,
Star,
DoublePlus,
PlusEqual,
Plus,
Comma,
MinusEqual,
DoubleMinus,
Minus,
Dot,
SlashEqual,
Slash,
Colon,
Semicolon,
DoubleLeftChevronEqual,
DoubleLeftChevron,
LeftChevronEqual,
LeftChevron,
DoubleEqual,
Equal,
RightChevronEqual,
DoubleRightChevronEqual,
TripleRightChevronEqual,
TripleRightChevron,
DoubleRightChevron,
RightChevron,
QuestionMark,
PercentEqual,
Percent,
AmpersandEqual,
DoubleAmpersand,
Ampersand,
LeftBracket,
RightBracket,
CaretEqual,
Caret,
LeftBrace,
RightBrace,
Tilde,
PipeEqual,
DoublePipe,
Pipe,
}
impl fmt::Display for Operator {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
use self::Operator::*;
write!(
f,
"{}",
match self {
TripleRightChevronEqual => ">>>=",
DoubleLeftChevronEqual => "<<=",
DoubleRightChevronEqual => ">>=",
TripleRightChevron => ">>>",
ExclaimEqual => "!=",
StarEqual => "*=",
DoublePlus => "++",
PlusEqual => "+=",
MinusEqual => "-=",
DoubleMinus => "--",
SlashEqual => "/=",
DoubleLeftChevron => "<<",
LeftChevronEqual => "<=",
DoubleEqual => "==",
RightChevronEqual => ">=",
DoubleRightChevron => ">>",
PercentEqual => "%=",
AmpersandEqual => "&=",
DoubleAmpersand => "&&",
CaretEqual => "^=",
PipeEqual => "|=",
DoublePipe => "||",
Exclaim => "!",
LeftParen => "(",
RightParen => ")",
Star => "*",
Plus => "+",
Comma => ",",
Minus => "-",
Dot => ".",
Slash => "/",
Colon => ":",
Semicolon => ";",
LeftChevron => "<",
Equal => "=",
RightChevron => ">",
QuestionMark => "?",
Percent => "%",
Ampersand => "&",
LeftBracket => "[",
RightBracket => "]",
Caret => "^",
LeftBrace => "{",
RightBrace => "}",
Tilde => "~",
Pipe => "|",
}
)
}
}
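/// The lexer: an iterator over `TokenResult`s that reads characters from a
/// `PositionIterator` and interns identifiers in the given `StringTable`.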
pub struct Lexer<'f, 's> {
input: PositionIterator<'f>,
strtab: &'s mut StringTable<'f>,
context: &'f Context<'f>,
}
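/// MiniJava treats only space, newline, carriage return and tab as
/// whitespace; any other whitespace character falls through to the
/// unexpected-character error path.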
fn is_minijava_whitespace(c: char) -> bool {
match c {
' ' | '\n' | '\r' | '\t' => true,
_ => false,
}
}
impl<'f, 's> Lexer<'f, 's> {
pub fn new(strtab: &'s mut StringTable<'f>, context: &'f Context<'f>) -> Self {
let input = context.file.iter();
Self {
context,
strtab,
input,
}
}
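    // Dispatches on the next unconsumed character: identifiers/keywords,
    // integer literals, whitespace and comments each have a dedicated lexing
    // routine; everything else is tried as an operator and reported as an
    // unexpected character if that fails.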
fn lex_token(&mut self) -> Option<TokenResult<'f>> {
match self.input.peek() {
Some(position) => Some(match position.chr() {
'a'..='z' | 'A'..='Z' | '_' => self.lex_identifier_or_keyword(),
'1'..='9' => self.lex_integer_literal(),
'0' => self.lex_zero_integer_literal(),
c if is_minijava_whitespace(c) => self.lex_whitespace(),
'/' if self.input.matches("/*") => self.lex_comment(),
_ => self.lex_operator().unwrap_or_else(|| {
Err(LexicalError::new(
position.to_single_char_span(),
ErrorKind::UnexpectedCharacter(position.byte()),
))
}),
}),
None => None,
}
}
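    // A '0' always forms a literal of its own, so an input such as `0123`
    // lexes as the two literals `0` and `123` and can then be rejected by a
    // later stage.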
fn lex_zero_integer_literal(&mut self) -> TokenResult<'f> {
let position = self.input.next().unwrap();
Ok(Token::new(
position.to_single_char_span(),
TokenKind::IntegerLiteral("0"),
))
}
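    // Lexes a maximal run of identifier characters, then decides via
    // `Keyword::try_from` whether it is a keyword; only real identifiers are
    // interned in the string table.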
fn lex_identifier_or_keyword(&mut self) -> TokenResult<'f> {
assert_matches!(self.input.peek().unwrap().chr(), 'a'..='z' | 'A'..='Z' | '_');
let span = self.lex_while(
|position, _| matches!(position.chr(), 'a'..='z' | 'A'..='Z' | '0'..='9' | '_'),
);
let kind = match Keyword::try_from(span.as_str()) {
Ok(keyword) => TokenKind::Keyword(keyword),
Err(_) => TokenKind::Identifier(self.strtab.intern(span.as_str())),
};
Ok(Token::new(span, kind))
}
fn lex_integer_literal(&mut self) -> TokenResult<'f> {
assert_matches!(self.input.peek().unwrap().chr(), '1'..='9');
let span = self.lex_while(|position, _| matches!(position.chr(), '0'..='9'));
let kind = TokenKind::IntegerLiteral(span.as_str());
Ok(Token::new(span, kind))
}
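    // Consumes the leading "/*", then scans two characters at a time until
    // the closing "*/". A nested "/*" only produces a warning (comments do
    // not nest), and a missing "*/" at end of input is an error spanning the
    // unterminated comment.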
fn lex_comment(&mut self) -> TokenResult<'f> {
debug_assert!(self.input.matches("/*"));
let comment_start = self.input.next().unwrap();
self.input.next().unwrap();
let comment_body = self.lex_while_multiple(2, |span, context| {
if span.as_str() == "/*" {
context.diagnostics.warning(&Spanned {
span,
data: Warning::CommentSeparatorInsideComment,
});
}
span.as_str() != "*/"
});
if self.input.eof_reached() {
let span = comment_body.extend_to_position(&comment_start);
Err(LexicalError::new(span, ErrorKind::UnclosedComment))
} else {
debug_assert_eq!(self.input.peek_exactly(2).unwrap().as_str(), "*/");
self.input.next().unwrap();
let comment_end = self.input.next().unwrap();
let span = Span::new(comment_start, comment_end);
Ok(Token::new(span, TokenKind::Comment(comment_body.as_str())))
}
}
fn lex_whitespace(&mut self) -> TokenResult<'f> {
debug_assert!(is_minijava_whitespace(self.input.peek().unwrap().chr()));
let span = self.lex_while(|position, _| is_minijava_whitespace(position.chr()));
Ok(Token::new(span, TokenKind::Whitespace))
}
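    // Longest-match operator lexing: the nested `match_op!` invocations try a
    // 4-character lookahead first, then fall back to 3, 2 and finally a
    // single character before giving up with `None`.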
#[allow(clippy::cyclomatic_complexity)]
fn lex_operator(&mut self) -> Option<TokenResult<'f>> {
use self::Operator::*;
match_op!(
self.input,
(">>>=", TripleRightChevronEqual):
4,
match_op!(
self.input,
("<<=", DoubleLeftChevronEqual),
(">>=", DoubleRightChevronEqual),
(">>>", TripleRightChevron):
3,
match_op!(
self.input,
("!=", ExclaimEqual),
("*=", StarEqual),
("++", DoublePlus),
("+=", PlusEqual),
("-=", MinusEqual),
("--", DoubleMinus),
("/=", SlashEqual),
("<<", DoubleLeftChevron),
("<=", LeftChevronEqual),
("==", DoubleEqual),
(">=", RightChevronEqual),
(">>", DoubleRightChevron),
("%=", PercentEqual),
("&=", AmpersandEqual),
("&&", DoubleAmpersand),
("^=", CaretEqual),
("|=", PipeEqual),
("||", DoublePipe):
2,
match_op!(
self.input,
("!", Exclaim),
("(", LeftParen),
(")", RightParen),
("*", Star),
("+", Plus),
(",", Comma),
("-", Minus),
(".", Dot),
("/", Slash),
(":", Colon),
(";", Semicolon),
("<", LeftChevron),
("=", Equal),
(">", RightChevron),
("?", QuestionMark),
("%", Percent),
("&", Ampersand),
("[", LeftBracket),
("]", RightBracket),
("^", Caret),
("{", LeftBrace),
("}", RightBrace),
("~", Tilde),
("|", Pipe):
1,
None
)
)
)
)
}
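    // `lex_while` consumes characters as long as `predicate` holds for the
    // next one; `lex_while_multiple` is the generalization that peeks at the
    // next `n` characters. Both assume at least one character of input is
    // left, i.e. callers must have peeked successfully beforehand.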
fn lex_while<P>(&mut self, predicate: P) -> Span<'f>
where
P: Fn(Position<'f>, &'f Context<'f>) -> bool,
{
self.lex_while_multiple(1, |span, context| predicate(span.start_position(), context))
}
fn lex_while_multiple<P>(&mut self, n: usize, predicate: P) -> Span<'f>
where
P: Fn(Span<'f>, &'f Context<'f>) -> bool,
{
let mut consumed = self.input.peek().unwrap().to_single_char_span();
while let Some(peeked) = self.input.peek_at_most(n) {
if !predicate(peeked, self.context) {
break;
}
let position = self.input.next().unwrap();
consumed = consumed.extend_to_position(&position);
}
consumed
}
}
impl<'f, 's> Iterator for Lexer<'f, 's> {
type Item = TokenResult<'f>;
fn next(&mut self) -> Option<Self::Item> {
self.lex_token()
}
}
#[cfg(test)]
#[allow(clippy::print_stdout, clippy::use_debug)]
mod tests {
use super::{is_minijava_whitespace, Keyword, Operator, TokenKind};
use crate::lextest;
use failure::Error;
use std::io;
use strtab::StringTable;
fn write_token<O: io::Write>(out: &mut O, token: &TokenKind<'_>) -> Result<(), Error> {
match token {
TokenKind::Whitespace | TokenKind::Comment(_) => Ok(()),
_ => {
writeln!(out, "{}", lextest::Output::new(&token))?;
Ok(())
}
}
}
fn lexer_test_with_tokens(tokens: Vec<TokenKind<'_>>) -> String {
let mut o = Vec::new();
for token in tokens.into_iter() {
let res = write_token(&mut o, &token);
assert!(res.is_ok());
}
String::from_utf8(o).expect("output must be utf8")
}
#[test]
fn minijava_whitespace() {
        // Bell, backspace, form feed and vertical tab are control characters
        // that MiniJava does not treat as whitespace.
        let chars = "\x07\x08\x0c\x0b";
        for c in chars.chars() {
            println!("{:?}", c);
            assert!(!is_minijava_whitespace(c));
}
}
#[test]
fn newline_per_token() {
let tokens = vec![
TokenKind::Operator(Operator::Ampersand),
TokenKind::Keyword(Keyword::Int),
];
let tokens_len = tokens.len();
let o = lexer_test_with_tokens(tokens);
assert_eq!(o.lines().count(), tokens_len);
}
#[test]
fn no_whitespace_and_comments() {
let tokens = vec![
TokenKind::Operator(Operator::Ampersand),
TokenKind::Whitespace,
TokenKind::IntegerLiteral("foo"),
TokenKind::Comment("comment"),
TokenKind::Keyword(Keyword::If),
];
let o = lexer_test_with_tokens(tokens);
assert_eq!(o.lines().count(), 3);
assert!(!o.contains("comment"));
assert_eq!(&o, "&\ninteger literal foo\nif\n")
}
#[test]
fn keywords_as_is() {
let tokens = vec![TokenKind::Keyword(Keyword::Float)];
let o = lexer_test_with_tokens(tokens);
assert_eq!(&o, "float\n");
}
#[test]
fn operators_as_is() {
let o = lexer_test_with_tokens(vec![TokenKind::Operator(Operator::Caret)]);
assert_eq!(&o, "^\n");
}
#[test]
fn ident_prefix() {
let mut st = StringTable::new();
let o = lexer_test_with_tokens(vec![TokenKind::Identifier(st.intern("an_identifier"))]);
assert_eq!(&o, "identifier an_identifier\n");
}
#[test]
fn integer_literal_prefix() {
let o = lexer_test_with_tokens(vec![TokenKind::IntegerLiteral("2342")]);
assert_eq!(&o, "integer literal 2342\n");
}
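    // Sanity check (a minimal sketch): `Display` and `TryFrom<&str>` for
    // `Keyword` are generated from the same macro list, so formatting a
    // keyword and parsing the result back should round-trip.
    #[test]
    fn keyword_display_tryfrom_roundtrip() {
        use std::convert::TryFrom;
        for kw in &[Keyword::While, Keyword::InstanceOf, Keyword::StrictFp] {
            let spelled = format!("{}", kw);
            assert_eq!(Keyword::try_from(spelled.as_str()), Ok(*kw));
        }
        assert!(Keyword::try_from("not_a_keyword").is_err());
    }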
}