// Copyright 2017 Jeremy Wall
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! The tokenization stage of the ucg compiler.
use std;

use abortable_parser::combinators::*;
use abortable_parser::iter::SliceIter;
use abortable_parser::{Error, Result};

use crate::ast::*;
use crate::error::StackPrinter;
use crate::iter::OffsetStrIter;

fn is_symbol_char<'a>(i: OffsetStrIter<'a>) -> Result<OffsetStrIter<'a>, u8> {
    let mut _i = i.clone();
    let c = match _i.next() {
        Some(c) => *c,
        None => {
            return Result::Fail(Error::new(
                "Unexpected End of Input".to_string(),
                Box::new(_i.clone()),
            ));
        }
    };
    if (c as char).is_ascii_alphanumeric() || c == b'-' || c == b'_' {
        Result::Complete(_i, c)
    } else {
        Result::Fail(Error::new(
            "Not a symbol character".to_string(),
            Box::new(_i.clone()),
        ))
    }
}

fn escapequoted<'a>(input: OffsetStrIter<'a>) -> Result<OffsetStrIter<'a>, String> {
    // loop until we find a " that is not preceded by \.
    // Collapse all \<char> to just char for escaping.
    let mut frag = String::new();
    let mut escape = false;
    let mut _input = input.clone();
    loop {
        let c = match _input.next() {
            Some(c) => *c,
            None => break,
        };
        if c == '\\' as u8 && !escape {
            // eat this slash and set our escaping sentinel
            escape = true;
        } else if c == '"' as u8 && !escape {
            // Bail if this is an unescaped "
            // we exit here.
            return Result::Complete(_input, frag);
        } else {
            // we accumulate this character.
            frag.push(c as char);
            escape = false; // reset our escaping sentinel
        }
    }
    return Result::Incomplete(_input.clone());
}
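// A minimal usage sketch of `escapequoted`: it consumes bytes up to the first
// unescaped `"` and returns everything before it as the string fragment. The
// `OffsetStrIter::new(&str)` constructor used here is assumed, not defined in
// this file.
#[cfg(test)]
mod escapequoted_sketch {
    use super::escapequoted;
    use crate::iter::OffsetStrIter;
    use abortable_parser::Result;

    #[test]
    fn stops_at_the_first_unescaped_quote() {
        // The closing `"` terminates the fragment and is consumed from the input.
        match escapequoted(OffsetStrIter::new("hello world\" tail")) {
            Result::Complete(_rest, frag) => assert_eq!(frag, "hello world"),
            _ => panic!("expected a Complete result"),
        }
    }
}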
make_fn!(strtok,
    do_each!(
        span => input!(),
        _ => text_token!("\""),
        frag => escapequoted,
        (Token {
            typ: TokenType::QUOTED,
            pos: Position::from(&span),
            fragment: frag.to_string(),
        })
    )
);

make_fn!(barewordtok,
    do_each!(
        span => input!(),
        _ => peek!(ascii_alpha),
        frag => consume_all!(is_symbol_char),
        (Token {
            typ: TokenType::BAREWORD,
            pos: Position::from(&span),
            fragment: frag.to_string(),
        })
    )
);

make_fn!(digittok,
    do_each!(
        span => input!(),
        _ => peek!(ascii_digit),
        digits => consume_all!(ascii_digit),
        (Token {
            typ: TokenType::DIGIT,
            pos: Position::from(&span),
            fragment: digits.to_string(),
        })
    )
);

make_fn!(booleantok,
    do_each!(
        span => input!(),
        token => either!(
            text_token!("true"),
            text_token!("false")
        ),
        (Token {
            typ: TokenType::BOOLEAN,
            pos: Position::from(&span),
            fragment: token.to_string(),
        })
    )
);

/// do_text_token_tok! is a helper macro to make building a simple text_token token
/// less code.
macro_rules! do_text_token_tok {
    ($i:expr, $type:expr, $text_token:expr, WS) => {
        do_each!($i,
            span => input!(),
            frag => text_token!($text_token),
            _ => either!(whitespace, comment),
            (Token {
                typ: $type,
                pos: Position::from(&span),
                fragment: frag.to_string(),
            })
        )
    };
    ($i:expr, $type:expr, $text_token:expr) => {
        do_each!($i,
            span => input!(),
            frag => text_token!($text_token),
            (Token {
                typ: $type,
                pos: Position::from(&span),
                fragment: frag.to_string(),
            })
        )
    };
}

make_fn!(emptytok,
    do_text_token_tok!(TokenType::EMPTY, "NULL")
);
make_fn!(commatok,
    do_text_token_tok!(TokenType::PUNCT, ",")
);
make_fn!(lbracetok,
    do_text_token_tok!(TokenType::PUNCT, "{")
);
make_fn!(rbracetok,
    do_text_token_tok!(TokenType::PUNCT, "}")
);
make_fn!(lparentok,
    do_text_token_tok!(TokenType::PUNCT, "(")
);
make_fn!(rparentok,
    do_text_token_tok!(TokenType::PUNCT, ")")
);
make_fn!(dottok,
    do_text_token_tok!(TokenType::PUNCT, ".")
);
make_fn!(plustok,
    do_text_token_tok!(TokenType::PUNCT, "+")
);
make_fn!(dashtok,
    do_text_token_tok!(TokenType::PUNCT, "-")
);
make_fn!(startok,
    do_text_token_tok!(TokenType::PUNCT, "*")
);
make_fn!(slashtok,
    do_text_token_tok!(TokenType::PUNCT, "/")
);
make_fn!(modulustok,
    do_text_token_tok!(TokenType::PUNCT, "%%")
);
make_fn!(pcttok,
    do_text_token_tok!(TokenType::PUNCT, "%")
);
make_fn!(eqeqtok,
    do_text_token_tok!(TokenType::PUNCT, "==")
);
make_fn!(notequaltok,
    do_text_token_tok!(TokenType::PUNCT, "!=")
);
make_fn!(matchtok,
    do_text_token_tok!(TokenType::PUNCT, "~")
);
make_fn!(notmatchtok,
    do_text_token_tok!(TokenType::PUNCT, "!~")
);
make_fn!(gttok,
    do_text_token_tok!(TokenType::PUNCT, ">")
);
make_fn!(gtequaltok,
    do_text_token_tok!(TokenType::PUNCT, ">=")
);
make_fn!(ltequaltok,
    do_text_token_tok!(TokenType::PUNCT, "<=")
);
make_fn!(lttok,
    do_text_token_tok!(TokenType::PUNCT, "<")
);
make_fn!(equaltok,
    do_text_token_tok!(TokenType::PUNCT, "=")
);
make_fn!(semicolontok,
    do_text_token_tok!(TokenType::PUNCT, ";")
);
make_fn!(colontok,
    do_text_token_tok!(TokenType::PUNCT, ":")
);
make_fn!(leftsquarebracket,
    do_text_token_tok!(TokenType::PUNCT, "[")
);
make_fn!(rightsquarebracket,
    do_text_token_tok!(TokenType::PUNCT, "]")
);
make_fn!(fatcommatok,
    do_text_token_tok!(TokenType::PUNCT, "=>")
);
make_fn!(andtok,
    do_text_token_tok!(TokenType::PUNCT, "&&")
);
make_fn!(ortok,
    do_text_token_tok!(TokenType::PUNCT, "||")
);
make_fn!(selecttok,
    do_text_token_tok!(TokenType::BAREWORD, "select", WS)
);
make_fn!(intok,
    do_text_token_tok!(TokenType::BAREWORD, "in", WS)
);
make_fn!(istok,
    do_text_token_tok!(TokenType::BAREWORD, "is", WS)
);
make_fn!(nottok,
    do_text_token_tok!(TokenType::BAREWORD, "not", WS)
);
make_fn!(failtok,
    do_text_token_tok!(TokenType::BAREWORD, "fail", WS)
);
make_fn!(functok,
    do_text_token_tok!(TokenType::BAREWORD, "func", WS)
);
make_fn!(moduletok,
    do_text_token_tok!(TokenType::BAREWORD, "module", WS)
);
make_fn!(lettok,
    do_text_token_tok!(TokenType::BAREWORD, "let", WS)
);
make_fn!(importtok,
    do_text_token_tok!(TokenType::BAREWORD, "import", WS)
);
make_fn!(includetok,
    do_text_token_tok!(TokenType::BAREWORD, "include", WS)
);
make_fn!(asserttok,
    do_text_token_tok!(TokenType::BAREWORD, "assert", WS)
);
make_fn!(outtok,
    do_text_token_tok!(TokenType::BAREWORD, "out", WS)
);
make_fn!(astok,
    do_text_token_tok!(TokenType::BAREWORD, "as", WS)
);
make_fn!(maptok,
    do_text_token_tok!(TokenType::BAREWORD, "map", WS)
);
make_fn!(filtertok,
    do_text_token_tok!(TokenType::BAREWORD, "filter", WS)
);
make_fn!(reducetok,
    do_text_token_tok!(TokenType::BAREWORD, "reduce", WS)
);
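// A minimal sketch of how the WS variant of do_text_token_tok! behaves: keyword
// tokenizers only match when the keyword is followed by whitespace or a comment,
// so a longer bareword like "letter" is not mistaken for `let`. The
// `OffsetStrIter::new(&str)` constructor used here is assumed, not defined in
// this file.
#[cfg(test)]
mod keyword_sketch {
    use super::{barewordtok, lettok};
    use crate::iter::OffsetStrIter;

    #[test]
    fn keywords_require_trailing_whitespace_or_comment() {
        // "let" followed by whitespace is the `let` keyword token.
        assert!(lettok(OffsetStrIter::new("let foo")).is_complete());
        // "letter" is not the keyword; the characters after "let" are neither
        // whitespace nor a comment, so it lexes as a bareword instead.
        assert!(!lettok(OffsetStrIter::new("letter")).is_complete());
        assert!(barewordtok(OffsetStrIter::new("letter")).is_complete());
    }
}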
fn comment(input: OffsetStrIter) -> Result<OffsetStrIter, Token> {
    match text_token!(input, "//") {
        Result::Complete(rest, _) => {
            match until!(
                rest,
                either!(
                    eoi,
                    discard!(text_token!("\r\n")),
                    discard!(text_token!("\n"))
                )
            ) {
                Result::Complete(rest, cmt) => {
                    return Result::Complete(rest, make_tok!(CMT => cmt.to_string(), input));
                }
                // If we didn't find a new line or the end of input then this
                // comment is unparsable.
                _ => {
                    return Result::Abort(Error::new(
                        "Unparsable comment".to_string(),
                        Box::new(rest.clone()),
                    ));
                }
            }
        }
        Result::Incomplete(ctx) => return Result::Incomplete(ctx),
        Result::Fail(e) => return Result::Fail(e),
        Result::Abort(e) => return Result::Abort(e),
    }
}

make_fn!(whitespace,
    do_each!(
        span => input!(),
        _ => peek!(ascii_ws),
        _ => repeat!(ascii_ws),
        (Token {
            typ: TokenType::WS,
            pos: Position::from(&span),
            fragment: String::new(),
        })
    )
);

make_fn!(end_of_input,
    do_each!(
        span => input!(),
        _ => eoi,
        (Token {
            typ: TokenType::END,
            pos: Position::from(&span),
            fragment: String::new(),
        })
    )
);

fn token<'a>(input: OffsetStrIter<'a>) -> Result<OffsetStrIter<'a>, Token> {
    either!(
        input,
        strtok,
        emptytok, // This must come before the barewordtok
        digittok,
        commatok,
        rbracetok,
        lbracetok,
        lparentok,
        rparentok,
        dottok,
        andtok,
        ortok,
        plustok,
        dashtok,
        startok,
        comment, // Note comment must come before slashtok
        slashtok,
        modulustok,
        pcttok,
        eqeqtok,
        notequaltok,
        matchtok,
        notmatchtok,
        complete!("Not >=".to_string(), gtequaltok),
        complete!("Not <=".to_string(), ltequaltok),
        gttok,
        lttok,
        fatcommatok, // Note fatcommatok must come before equaltok
        equaltok,
        semicolontok,
        colontok,
        leftsquarebracket,
        rightsquarebracket,
        booleantok,
        intok,
        istok,
        nottok,
        lettok,
        outtok,
        selecttok,
        asserttok,
        failtok,
        functok,
        moduletok,
        importtok,
        includetok,
        astok,
        maptok,
        filtertok,
        reducetok,
        barewordtok,
        whitespace,
        end_of_input
    )
}

/// Consumes an input OffsetStrIter and returns either a Vec<Token> or an error String.
pub fn tokenize<'a>(input: OffsetStrIter<'a>) -> std::result::Result<Vec<Token>, String> {
    let mut out = Vec::new();
    let mut i = input.clone();
    loop {
        if let Result::Complete(_, _) = eoi(i.clone()) {
            break;
        }
        match token(i.clone()) {
            Result::Abort(e) => {
                let err = abortable_parser::Error::caused_by(
                    "Invalid Token encountered",
                    Box::new(e),
                    Box::new(i.clone()),
                );
                let ctx_err = StackPrinter { err: err };
                return Err(format!("{}", ctx_err));
            }
            Result::Fail(e) => {
                let err = abortable_parser::Error::caused_by(
                    "Invalid Token encountered",
                    Box::new(e),
                    Box::new(i.clone()),
                );
                let ctx_err = StackPrinter { err: err };
                return Err(format!("{}", ctx_err));
            }
            Result::Incomplete(_offset) => {
                let err =
                    abortable_parser::Error::new("Invalid Token encountered", Box::new(i.clone()));
                let ctx_err = StackPrinter { err: err };
                return Err(format!("{}", ctx_err));
            }
            Result::Complete(rest, tok) => {
                i = rest;
                if tok.typ == TokenType::COMMENT || tok.typ == TokenType::WS {
                    // we skip comments and whitespace
                    continue;
                }
                out.push(tok);
            }
        }
    }
    // ensure that we always have an END token to go off of.
    out.push(Token {
        fragment: String::new(),
        typ: TokenType::END,
        pos: Position::from(&i),
    });
    Ok(out)
}
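// A minimal usage sketch of `tokenize`: whitespace and comment tokens are
// dropped from the output and an END token is always appended. The
// `OffsetStrIter::new(&str)` constructor used here is assumed, not defined in
// this file.
#[cfg(test)]
mod tokenize_sketch {
    use super::tokenize;
    use crate::ast::TokenType;
    use crate::iter::OffsetStrIter;

    #[test]
    fn tokenizes_a_small_statement() {
        let toks = tokenize(OffsetStrIter::new("let x = 1;")).expect("tokenize should succeed");
        // let, x, =, 1, ; plus the trailing END marker.
        assert_eq!(toks.len(), 6);
        assert!(toks[0].typ == TokenType::BAREWORD && toks[0].fragment == "let");
        assert!(toks[3].typ == TokenType::DIGIT && toks[3].fragment == "1");
        assert!(toks[5].typ == TokenType::END);
    }
}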
/// Clones a token.
///
/// This is necessary to allow the match_type and match_token macros to work.
pub fn token_clone(t: &Token) -> std::result::Result<Token, Error<SliceIter<Token>>> {
    Ok(t.clone())
}

/// nom style macro that matches a Token by type and uses an optional conversion handler
/// for the matched Token.
macro_rules! match_type {
    ($i:expr,BOOLEAN => $h:expr) => {
        match_type!($i, TokenType::BOOLEAN, "Not a Boolean", $h)
    };
    ($i:expr,BOOLEAN) => {
        match_type!($i, BOOLEAN => token_clone)
    };
    ($i:expr,COMMENT => $h:expr) => {
        match_type!($i, TokenType::COMMENT, "Not a Comment", $h)
    };
    ($i:expr,COMMENT) => {
        match_type!($i, COMMENT => token_clone)
    };
    ($i:expr,BAREWORD => $h:expr) => {
        match_type!($i, TokenType::BAREWORD, "Not a Bareword", $h)
    };
    ($i:expr,BAREWORD) => {
        match_type!($i, BAREWORD => token_clone)
    };
    ($i:expr,EMPTY => $h:expr) => {
        match_type!($i, TokenType::EMPTY, "Not NULL", $h)
    };
    ($i:expr,EMPTY) => {
        match_type!($i, EMPTY => token_clone)
    };
    ($i:expr,STR => $h:expr) => {
        match_type!($i, TokenType::QUOTED, "Not a String", $h)
    };
    ($i:expr,STR) => {
        match_type!($i, STR => token_clone)
    };
    ($i:expr,DIGIT => $h:expr) => {
        match_type!($i, TokenType::DIGIT, "Not a DIGIT", $h)
    };
    ($i:expr,DIGIT) => {
        match_type!($i, DIGIT => token_clone)
    };
    ($i:expr,PUNCT => $h:expr) => {
        match_type!($i, TokenType::PUNCT, "Not PUNCTUATION", $h)
    };
    ($i:expr,PUNCT) => {
        match_type!($i, PUNCT => token_clone)
    };
    ($i:expr, $t:expr, $msg:expr, $h:expr) => {{
        use abortable_parser::combinators::eoi;
        use abortable_parser::{Error, Result};
        use std;

        let mut _i = $i.clone();
        if eoi(_i.clone()).is_complete() {
            Result::Fail(Error::new(format!("End of Input! {}", $msg), Box::new(_i)))
        } else {
            match _i.next() {
                Some(tok) => {
                    if tok.typ == $t {
                        match $h(tok) {
                            std::result::Result::Ok(v) => Result::Complete(_i.clone(), v),
                            std::result::Result::Err(e) => {
                                Result::Fail(Error::caused_by($msg, Box::new(e), Box::new(_i)))
                            }
                        }
                    } else {
                        Result::Fail(Error::new($msg.to_string(), Box::new($i)))
                    }
                }
                None => Result::Fail(Error::new($msg.to_string(), Box::new($i))),
            }
        }
    }};
}

/// nom style macro that matches various Tokens by type and value and allows optional
/// conversion handlers for the matched Token.
macro_rules! match_token {
    ($i:expr,PUNCT => $f:expr) => {{
        use crate::tokenizer::token_clone;
        match_token!($i, PUNCT => $f, token_clone)
    }};
    ($i:expr,PUNCT => $f:expr, $h:expr) => {
        match_token!($i, TokenType::PUNCT, $f, format!("({})", $f), $h)
    };
    ($i:expr,BAREWORD => $f:expr) => {{
        use crate::tokenizer::token_clone;
        match_token!($i, BAREWORD => $f, token_clone)
    }};
    ($i:expr,BAREWORD => $f:expr, $h:expr) => {
        match_token!(
            $i,
            TokenType::BAREWORD,
            $f,
            format!("Expected BAREWORD but got ({})", $f),
            $h
        )
    };
    ($i:expr, $t:expr, $f:expr, $msg:expr, $h:expr) => {{
        use abortable_parser::Result;
        use std;

        let mut i_ = $i.clone();
        let tok = i_.next();
        if let Some(tok) = tok {
            if tok.typ == $t && &tok.fragment == $f {
                match $h(tok) {
                    std::result::Result::Ok(v) => Result::Complete(i_.clone(), v),
                    std::result::Result::Err(e) => {
                        Result::Fail(Error::caused_by($msg, Box::new(e), Box::new(i_)))
                    }
                }
            } else {
                Result::Fail(Error::new(
                    format!("Expected {} but got ({})", $msg, tok.fragment),
                    Box::new($i.clone()),
                ))
            }
        } else {
            Result::Fail(Error::new("Unexpected End Of Input", Box::new(i_)))
        }
    }};
}

/// nom style macro that matches punctuation Tokens.
macro_rules! punct {
    ($i:expr, $c:expr) => {
        match_token!($i, PUNCT => $c)
    };
}

/// nom style macro that matches any bareword Token.
macro_rules! word {
    ($i:expr, $w:expr) => {
        match_token!($i, BAREWORD => $w)
    };
}

/// pos gets the current position from a TokenIter input without consuming it.
pub fn pos<'a>(i: SliceIter<'a, Token>) -> Result<SliceIter<'a, Token>, Position> {
    let mut _i = i.clone();
    let tok = _i.next().unwrap();
    let pos = tok.pos.clone();
    Result::Complete(i, pos)
}

#[cfg(test)]
mod test;
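// A minimal sketch of how parser code consumes the token stream with the
// `word!` and `punct!` macros defined above. The `OffsetStrIter::new(&str)` and
// `SliceIter::new(&[Token])` constructors used here are assumed, not defined in
// this file.
#[cfg(test)]
mod match_macro_sketch {
    use super::tokenize;
    use crate::ast::TokenType;
    use crate::iter::OffsetStrIter;
    use abortable_parser::iter::SliceIter;
    use abortable_parser::{Error, Result};

    #[test]
    fn word_and_punct_match_on_type_and_fragment() {
        let toks = tokenize(OffsetStrIter::new("let x = 1;")).expect("tokenize should succeed");
        let iter = SliceIter::new(&toks);
        // `word!` matches a BAREWORD token with the exact fragment "let".
        let rest = match word!(iter.clone(), "let") {
            Result::Complete(rest, tok) => {
                assert!(tok.fragment == "let");
                rest
            }
            _ => panic!("expected the `let` keyword"),
        };
        // The next token is the bareword `x`, so matching a PUNCT fails.
        assert!(!punct!(rest.clone(), "=").is_complete());
    }
}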