/* * Copyright (c) 2014-2017, Eren Okka * Copyright (c) 2016-2017, Paul Miller * Copyright (c) 2017-2018, Tyler Bratton * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ using System; using System.Collections.Generic; using System.Linq; using System.Text; namespace AnitomySharp { /// /// A class that will tokenize an anime filename. /// /// 用于动画文件名标记化的分词器 /// public class Tokenizer { /// /// 用于标记化的文件名 /// private readonly string _filename; /// /// 用于添加预处理后标记(token)的元素列表 /// private readonly List _elements; /// /// 用于解析的配置 /// private readonly Options _options; /// /// 用于存储标记(token)的列表 /// private readonly List _tokens; /// /// 括号列表 /// private static readonly List> Brackets = new List> { new Tuple("(", ")"), // U+0028-U+0029 new Tuple("[", "]"), // U+005B-U+005D Square bracket new Tuple("{", "}"), // U+007B-U+007D Curly bracket new Tuple("\u300C", "\u300D"), // Corner bracket 「」 new Tuple("\u300E", "\u300F"), // White corner bracket 『 』 new Tuple("\u3010", "\u3011"), // Black lenticular bracket 【】 new Tuple("\u3014", "\u3015"), // Black lenticular bracket 〔 〕 new Tuple("\u3016", "\u3017"), // Black lenticular bracket 〖 〗 new Tuple("\uFF08", "\uFF09"), // Fullwidth parenthesis ( ) new Tuple("\uFF3B", "\uFF3D"), // Fullwidth parenthesis [ ] new Tuple("\uFF5B", "\uFF5D") // Fullwidth parenthesis { } }; /// /// Tokenize a filename into s /// /// 将传入的文件名标记化,拆分为单个元素 /// /// /// the filename /// the list of elements where pre-identified tokens will be added /// the parser options /// the list of tokens where tokens will be added public Tokenizer(string filename, List elements, Options options, List tokens) { _filename = filename; _elements = elements; _options = options; _tokens = tokens; } /// /// Returns true if tokenization was successful; false otherwise. /// /// 按照括号列表执行分词,根据大小判断是否标记化成功。成功返回true;否则为false。 /// /// public bool Tokenize() { TokenizeByBrackets(); return _tokens.Count > 0; } /// /// Adds a token to the internal list of tokens /// /// 添加标记(token)至_tokens列表 /// /// the token category /// whether or not the token is enclosed in braces /// the token range private void AddToken(Token.TokenCategory category, bool enclosed, TokenRange range) { _tokens.Add(new Token(category, enclosed, StringHelper.SubstringWithCheck(_filename, range.Offset, range.Size))); } /// /// 根据分隔符配置,提取当前字符串范围内出现过的分隔符 /// /// /// private string GetDelimiters(TokenRange range) { var delimiters = new StringBuilder(); bool IsDelimiter(char c) { /** alphanumeric不属于分隔符 */ if (StringHelper.IsAlphanumericChar(c)) return false; return _options.AllowedDelimiters.Contains(c.ToString()) && !delimiters.ToString().Contains(c.ToString()); } foreach (var i in Enumerable.Range(range.Offset, Math.Min(_filename.Length, range.Offset + range.Size) - range.Offset) .Where(value => IsDelimiter(_filename[value]))) { delimiters.Append(_filename[i]); } return delimiters.ToString(); } /// /// Tokenize by bracket. /// /// 使用括号列表规则进行分词 /// /// 括号总是成对出现。将括号作为停用符,将文件名划为多块 private void TokenizeByBrackets() { /** 匹配到的(右)括号类型 */ string matchingBracket = null; /** 返回范围内第一个(左)括号位置 */ int FindFirstBracket(int start, int end) { for (var i = start; i < end; i++) { foreach (var bracket in Brackets) { /** 和括号列表中每对的第一个括号进行比较 */ if (!_filename[i].Equals(char.Parse(bracket.Item1))) continue; matchingBracket = bracket.Item2; return i; } } return -1; } /** 括号是否闭合 */ var isBracketOpen = false; for (var i = 0; i < _filename.Length;) { /**用于后续分词的终止位置,其逻辑为: 1. 如果括号未闭合(isBracketOpen = false),使用结果1:获得第一个(左)括号位置 2. 如果括号闭合(isBracketOpen = true),使用结果2:查找上一次匹配到的(右)括号(matchingBracket)的位置 */ var foundIdx = !isBracketOpen ? FindFirstBracket(i, _filename.Length) : _filename.IndexOf(matchingBracket, i, StringComparison.Ordinal); /** 1. 非括号起始至第一个左括号 2. 左括号右边至右括号左边 3. 最后一个右括号至末尾 */ var range = new TokenRange(i, foundIdx == -1 ? _filename.Length : foundIdx - i); if (range.Size > 0) { // Check if our range contains any known anime identifiers TokenizeByPreidentified(isBracketOpen, range); } if (foundIdx != -1) { // mark as bracket 标记为括号,并添加到_tokens列表 AddToken(Token.TokenCategory.Bracket, true, new TokenRange(range.Offset + range.Size, 1)); /** 括号是否闭合 取反 */ isBracketOpen = !isBracketOpen; i = foundIdx + 1; } else { break; } } } /// /// Tokenize by looking for known anime identifiers /// /// 根据已知的动画关键词列表来分词 /// /// whether or not the current range is enclosed in braces. 当前范围是否位于闭合的括号中。 /// the token range 标记的范围 private void TokenizeByPreidentified(bool enclosed, TokenRange range) { var preidentifiedTokens = new List(); // Find known anime identifiers KeywordManager.PeekAndAdd(_filename, range, _elements, preidentifiedTokens); var offset = range.Offset; var subRange = new TokenRange(range.Offset, 0); while (offset < range.Offset + range.Size) { foreach (var preidentifiedToken in preidentifiedTokens) { if (offset != preidentifiedToken.Offset) continue; if (subRange.Size > 0) { TokenizeByDelimiters(enclosed, subRange); } AddToken(Token.TokenCategory.Identifier, enclosed, preidentifiedToken); /** subRange偏移量移至此token后 */ subRange.Offset = preidentifiedToken.Offset + preidentifiedToken.Size; offset = subRange.Offset - 1; // It's going to be incremented below } /** 1. 如果这段字符串最后一个字符属于预定义标识(keyword),则Size为0 2. 否则,Size大于0 */ subRange.Size = ++offset - subRange.Offset; } // Either there was no preidentified token range, or we're now about to process the tail of our current range /** 1. 没有预定义标识的范围 2. 处理当前范围剩余部分 */ if (subRange.Size > 0) { TokenizeByDelimiters(enclosed, subRange); } } /// /// Tokenize by delimiters allowed in .AllowedDelimiters. /// /// 使用提取元素时的分隔符配置进行分词 /// /// whether or not the current range is enclosed in braces /// the token range private void TokenizeByDelimiters(bool enclosed, TokenRange range) { var delimiters = GetDelimiters(range); /** 如果这段字符串无分隔符,则整个作为Unknown类型的标记(token) */ if (string.IsNullOrEmpty(delimiters)) { AddToken(Token.TokenCategory.Unknown, enclosed, range); return; } for (int i = range.Offset, end = range.Offset + range.Size; i < end;) { var found = Enumerable.Range(i, Math.Min(end, _filename.Length) - i) .Where(c => delimiters.Contains(_filename[c].ToString())) .DefaultIfEmpty(end) .FirstOrDefault(); var subRange = new TokenRange(i, found - i); if (subRange.Size > 0) { /** 分隔符分割后的字符串作为Unknown类型的标记(token) */ AddToken(Token.TokenCategory.Unknown, enclosed, subRange); } if (found != end) { /** 分隔符作为Delimiter类型的标记(token) */ AddToken(Token.TokenCategory.Delimiter, enclosed, new TokenRange(subRange.Offset + subRange.Size, 1)); i = found + 1; } else { break; } } ValidateDelimiterTokens(); } /// /// Validates tokens (make sure certain words delimited by certain tokens aren't split) /// /// 验证标记,确保由配置的分隔符提取标记(token)时不会将有意义的单词拆分 /// private void ValidateDelimiterTokens() { bool IsDelimiterToken(int it) { return Token.InListRange(it, _tokens) && _tokens[it].Category == Token.TokenCategory.Delimiter; } bool IsUnknownToken(int it) { return Token.InListRange(it, _tokens) && _tokens[it].Category == Token.TokenCategory.Unknown; } bool IsSingleCharacterToken(int it) { return IsUnknownToken(it) && _tokens[it].Content.Length == 1 && _tokens[it].Content[0] != '-'; } void AppendTokenTo(Token src, Token dest) { dest.Content += src.Content; src.Category = Token.TokenCategory.Invalid; } for (var i = 0; i < _tokens.Count; i++) { var token = _tokens[i]; if (token.Category != Token.TokenCategory.Delimiter) continue; var delimiter = token.Content[0]; var prevToken = Token.FindPrevToken(_tokens, i, Token.TokenFlag.FlagValid); var nextToken = Token.FindNextToken(_tokens, i, Token.TokenFlag.FlagValid); // Check for single-character tokens to prevent splitting group names, // keywords, episode numbers, etc. if (delimiter != ' ' && delimiter != '_') { // Single character token if (IsSingleCharacterToken(prevToken)) { AppendTokenTo(token, _tokens[prevToken]); while (IsUnknownToken(nextToken)) { AppendTokenTo(_tokens[nextToken], _tokens[prevToken]); nextToken = Token.FindNextToken(_tokens, i, Token.TokenFlag.FlagValid); if (!IsDelimiterToken(nextToken) || _tokens[nextToken].Content[0] != delimiter) continue; AppendTokenTo(_tokens[nextToken], _tokens[prevToken]); nextToken = Token.FindNextToken(_tokens, nextToken, Token.TokenFlag.FlagValid); } continue; } if (IsSingleCharacterToken(nextToken)) { AppendTokenTo(token, _tokens[prevToken]); AppendTokenTo(_tokens[nextToken], _tokens[prevToken]); continue; } } // Check for adjacent delimiters if (IsUnknownToken(prevToken) && IsDelimiterToken(nextToken)) { var nextDelimiter = _tokens[nextToken].Content[0]; if (delimiter != nextDelimiter && delimiter != ',') { if (nextDelimiter == ' ' || nextDelimiter == '_') { AppendTokenTo(token, _tokens[prevToken]); } } } else if (IsDelimiterToken(prevToken) && IsDelimiterToken(nextToken)) { var prevDelimiter = _tokens[prevToken].Content[0]; var nextDelimiter = _tokens[nextToken].Content[0]; if (prevDelimiter == nextDelimiter && prevDelimiter != delimiter) { token.Category = Token.TokenCategory.Unknown; // e.g. "& in "_&_" } } // Check for other special cases if (delimiter != '&' && delimiter != '+') continue; if (!IsUnknownToken(prevToken) || !IsUnknownToken(nextToken)) continue; if (!StringHelper.IsNumericString(_tokens[prevToken].Content) || !StringHelper.IsNumericString(_tokens[nextToken].Content)) continue; AppendTokenTo(token, _tokens[prevToken]); AppendTokenTo(_tokens[nextToken], _tokens[prevToken]); // e.g. 01+02 } // Remove invalid tokens _tokens.RemoveAll(token => token.Category == Token.TokenCategory.Invalid); } } }