// This is written to exactly parse the `font` shorthand in CSS2: // https://www.w3.org/TR/CSS22/fonts.html#font-shorthand // https://www.w3.org/TR/CSS22/syndata.html#tokenization // // We may want to update it for CSS 3 (e.g. font-stretch, or updated // tokenization) but I've only ever seen one or two issues filed in node-canvas // due to parsing in my 8 years on the project #include "FontParser.h" #include "CharData.h" #include #include Token::Token(Type type, std::string value) : type_(type), value_(std::move(value)) {} Token::Token(Type type, double value) : type_(type), value_(value) {} Token::Token(Type type) : type_(type), value_(std::string{}) {} const std::string& Token::getString() const { static const std::string empty; auto* str = std::get_if(&value_); return str ? *str : empty; } double Token::getNumber() const { auto* num = std::get_if(&value_); return num ? *num : 0.0f; } Tokenizer::Tokenizer(std::string_view input) : input_(input) {} std::string Tokenizer::utf8Encode(uint32_t codepoint) { std::string result; if (codepoint < 0x80) { result += static_cast(codepoint); } else if (codepoint < 0x800) { result += static_cast((codepoint >> 6) | 0xc0); result += static_cast((codepoint & 0x3f) | 0x80); } else if (codepoint < 0x10000) { result += static_cast((codepoint >> 12) | 0xe0); result += static_cast(((codepoint >> 6) & 0x3f) | 0x80); result += static_cast((codepoint & 0x3f) | 0x80); } else { result += static_cast((codepoint >> 18) | 0xf0); result += static_cast(((codepoint >> 12) & 0x3f) | 0x80); result += static_cast(((codepoint >> 6) & 0x3f) | 0x80); result += static_cast((codepoint & 0x3f) | 0x80); } return result; } char Tokenizer::peek() const { return position_ < input_.length() ? input_[position_] : '\0'; } char Tokenizer::advance() { return position_ < input_.length() ? input_[position_++] : '\0'; } Token Tokenizer::parseNumber() { enum class State { Start, AfterSign, Digits, AfterDecimal, AfterE, AfterESign, ExponentDigits }; size_t start = position_; size_t ePosition = 0; State state = State::Start; bool valid = false; while (position_ < input_.length()) { char c = peek(); uint8_t flags = charData[static_cast(c)]; switch (state) { case State::Start: if (flags & CharData::Sign) { position_++; state = State::AfterSign; } else if (flags & CharData::Digit) { position_++; state = State::Digits; valid = true; } else if (c == '.') { position_++; state = State::AfterDecimal; } else { goto done; } break; case State::AfterSign: if (flags & CharData::Digit) { position_++; state = State::Digits; valid = true; } else if (c == '.') { position_++; state = State::AfterDecimal; } else { goto done; } break; case State::Digits: if (flags & CharData::Digit) { position_++; } else if (c == '.') { position_++; state = State::AfterDecimal; } else if (c == 'e' || c == 'E') { ePosition = position_; position_++; state = State::AfterE; valid = false; } else { goto done; } break; case State::AfterDecimal: if (flags & CharData::Digit) { position_++; valid = true; state = State::Digits; } else { goto done; } break; case State::AfterE: if (flags & CharData::Sign) { position_++; state = State::AfterESign; } else if (flags & CharData::Digit) { position_++; valid = true; state = State::ExponentDigits; } else { position_ = ePosition; valid = true; goto done; } break; case State::AfterESign: if (flags & CharData::Digit) { position_++; valid = true; state = State::ExponentDigits; } else { position_ = ePosition; valid = true; goto done; } break; case State::ExponentDigits: if (flags & CharData::Digit) { position_++; } else { goto done; } break; } } done: if (!valid) { position_ = start; return Token(Token::Type::Invalid); } std::string number_str(input_.substr(start, position_ - start)); double value = std::stod(number_str); return Token(Token::Type::Number, value); } // Note that identifiers are always lower-case. This helps us make easier/more // efficient comparisons, but means that font-families specified as identifiers // will be lower-cased. Since font selection isn't case sensitive, this // shouldn't ever be a problem. Token Tokenizer::parseIdentifier() { std::string identifier; auto flags = CharData::Nmstart; auto start = position_; while (position_ < input_.length()) { char c = peek(); if (c == '\\') { advance(); if (!parseEscape(identifier)) { position_ = start; return Token(Token::Type::Invalid); } flags = CharData::Nmchar; } else if (charData[static_cast(c)] & flags) { identifier += advance() + (c >= 'A' && c <= 'Z' ? 32 : 0); flags = CharData::Nmchar; } else { break; } } return Token(Token::Type::Identifier, identifier); } uint32_t Tokenizer::parseUnicode() { uint32_t value = 0; size_t count = 0; while (position_ < input_.length() && count < 6) { char c = peek(); uint32_t digit; if (c >= '0' && c <= '9') { digit = c - '0'; } else if (c >= 'a' && c <= 'f') { digit = c - 'a' + 10; } else if (c >= 'A' && c <= 'F') { digit = c - 'A' + 10; } else { break; } value = value * 16 + digit; advance(); count++; } // Optional whitespace after hex escape char c = peek(); if (c == '\r') { advance(); if (peek() == '\n') advance(); } else if (isWhitespace(c)) { advance(); } return value; } bool Tokenizer::parseEscape(std::string& str) { char c = peek(); auto flags = charData[static_cast(c)]; if (flags & CharData::Hex) { str += utf8Encode(parseUnicode()); return true; } else if (!(flags & CharData::Newline) && !(flags & CharData::Hex)) { str += advance(); return true; } return false; } Token Tokenizer::parseString(char quote) { advance(); std::string value; auto start = position_; while (position_ < input_.length()) { char c = peek(); if (c == quote) { advance(); return Token(Token::Type::QuotedString, value); } else if (c == '\\') { advance(); c = peek(); if (c == '\r') { advance(); if (peek() == '\n') advance(); } else if (isNewline(c)) { advance(); } else { if (!parseEscape(value)) { position_ = start; return Token(Token::Type::Invalid); } } } else { value += advance(); } } position_ = start; return Token(Token::Type::Invalid); } Token Tokenizer::nextToken() { if (position_ >= input_.length()) { return Token(Token::Type::EndOfInput); } char c = peek(); auto flags = charData[static_cast(c)]; if (isWhitespace(c)) { std::string whitespace; while (position_ < input_.length() && isWhitespace(peek())) { whitespace += advance(); } return Token(Token::Type::Whitespace, whitespace); } if (flags & CharData::NumStart) { Token token = parseNumber(); if (token.type() != Token::Type::Invalid) return token; } if (flags & CharData::Nmstart) { Token token = parseIdentifier(); if (token.type() != Token::Type::Invalid) return token; } if (c == '"') { Token token = parseString('"'); if (token.type() != Token::Type::Invalid) return token; } if (c == '\'') { Token token = parseString('\''); if (token.type() != Token::Type::Invalid) return token; } switch (advance()) { case '/': return Token(Token::Type::Slash); case ',': return Token(Token::Type::Comma); case '%': return Token(Token::Type::Percent); default: return Token(Token::Type::Invalid); } } FontParser::FontParser(std::string_view input) : tokenizer_(input) , currentToken_(tokenizer_.nextToken()) , nextToken_(tokenizer_.nextToken()) {} const std::unordered_map FontParser::weightMap = { {"normal", 400}, {"bold", 700}, {"lighter", 100}, {"bolder", 700} }; const std::unordered_map FontParser::unitMap = { {"cm", 37.8f}, {"mm", 3.78f}, {"in", 96.0f}, {"pt", 96.0f / 72.0f}, {"pc", 96.0f / 6.0f}, {"em", 16.0f}, {"px", 1.0f} }; void FontParser::advance() { currentToken_ = nextToken_; nextToken_ = tokenizer_.nextToken(); } void FontParser::skipWs() { while (currentToken_.type() == Token::Type::Whitespace) advance(); } bool FontParser::check(Token::Type type) const { return currentToken_.type() == type; } bool FontParser::checkWs() const { return nextToken_.type() == Token::Type::Whitespace || nextToken_.type() == Token::Type::EndOfInput; } bool FontParser::parseFontStyle(FontProperties& props) { if (check(Token::Type::Identifier)) { const auto& value = currentToken_.getString(); if (value == "italic") { props.fontStyle = FontStyle::Italic; advance(); return true; } else if (value == "oblique") { props.fontStyle = FontStyle::Oblique; advance(); return true; } else if (value == "normal") { props.fontStyle = FontStyle::Normal; advance(); return true; } } return false; } bool FontParser::parseFontVariant(FontProperties& props) { if (check(Token::Type::Identifier)) { const auto& value = currentToken_.getString(); if (value == "small-caps") { props.fontVariant = FontVariant::SmallCaps; advance(); return true; } else if (value == "normal") { props.fontVariant = FontVariant::Normal; advance(); return true; } } return false; } bool FontParser::parseFontWeight(FontProperties& props) { if (check(Token::Type::Number)) { double weightFloat = currentToken_.getNumber(); int weight = static_cast(weightFloat); if (weight < 1 || weight > 1000) return false; props.fontWeight = static_cast(weight); advance(); return true; } else if (check(Token::Type::Identifier)) { const auto& value = currentToken_.getString(); if (auto it = weightMap.find(value); it != weightMap.end()) { props.fontWeight = it->second; advance(); return true; } } return false; } bool FontParser::parseFontSize(FontProperties& props) { if (!check(Token::Type::Number)) return false; props.fontSize = currentToken_.getNumber(); advance(); double multiplier = 1.0f; if (check(Token::Type::Identifier)) { const auto& unit = currentToken_.getString(); if (auto it = unitMap.find(unit); it != unitMap.end()) { multiplier = it->second; advance(); } else { return false; } } else if (check(Token::Type::Percent)) { multiplier = 16.0f / 100.0f; advance(); } else { return false; } // Technically if we consumed some tokens but couldn't parse the font-size, // we should rewind the tokenizer, but I don't think the grammar allows for // any valid alternates in this specific case props.fontSize *= multiplier; return true; } // line-height is not used by canvas ever, but should still parse bool FontParser::parseLineHeight(FontProperties& props) { if (check(Token::Type::Slash)) { advance(); skipWs(); if (check(Token::Type::Number)) { advance(); if (check(Token::Type::Percent)) { advance(); } else if (check(Token::Type::Identifier)) { auto identifier = currentToken_.getString(); if (auto it = unitMap.find(identifier); it != unitMap.end()) { advance(); } else { return false; } } else { return false; } } else if (check(Token::Type::Identifier) && currentToken_.getString() == "normal") { advance(); } else { return false; } } return true; } bool FontParser::parseFontFamily(FontProperties& props) { while (!check(Token::Type::EndOfInput)) { std::string family = ""; std::string trailingWs = ""; bool found = false; while ( check(Token::Type::QuotedString) || check(Token::Type::Identifier) || check(Token::Type::Whitespace) ) { if (check(Token::Type::Whitespace)) { if (found) trailingWs += currentToken_.getString(); } else { // Identifier, QuotedString if (found) { family += trailingWs; trailingWs.clear(); } family += currentToken_.getString(); found = true; } advance(); } if (!found) return false; // only whitespace or non-id/string found props.fontFamily.push_back(family); if (check(Token::Type::Comma)) advance(); } return true; } FontProperties FontParser::parse(const std::string& fontString, bool* success) { FontParser parser(fontString); auto result = parser.parseFont(); if (success) *success = !parser.hasError_; return result; } FontProperties FontParser::parseFont() { FontProperties props; uint8_t state = 0b111; skipWs(); for (size_t i = 0; i < 3 && checkWs(); i++) { if ((state & 0b001) && parseFontStyle(props)) { state &= 0b110; goto match; } if ((state & 0b010) && parseFontVariant(props)) { state &= 0b101; goto match; } if ((state & 0b100) && parseFontWeight(props)) { state &= 0b011; goto match; } break; // all attempts exhausted match: skipWs(); // success: move to the next non-ws token } if (parseFontSize(props)) { skipWs(); if (parseLineHeight(props) && parseFontFamily(props)) { return props; } } hasError_ = true; return props; }