/* * Copyright (c) 2014-2017, Eren Okka * Copyright (c) 2016-2017, Paul Miller * Copyright (c) 2017-2018, Tyler Bratton * * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at https://mozilla.org/MPL/2.0/. */ using System.Collections.Generic; using System.Linq; using System.Text.RegularExpressions; namespace AnitomySharp { /// /// Class to classify s /// /// 用于标记(token)分类的类 /// public class Parser { /// /// 用于确认元素是否已存在 /// public bool IsEpisodeKeywordsFound { get; private set; } /// /// /// public ParserHelper ParseHelper { get; } /// /// /// public ParserNumber ParseNumber { get; } /// /// 元素列表 /// public List Elements { get; } /// /// 标记列表 /// public List Tokens { get; } /// /// 提取元素时的配置项 /// private Options Options { get; } /// /// Constructs a new token parser /// /// 构造一个标记(token)解析 /// /// 并创建ParserHelper和ParserNumber各一个实例 /// /// the list where parsed elements will be added /// the parser options /// the list of tokens public Parser(List elements, Options options, List tokens) { Elements = elements; Options = options; Tokens = tokens; ParseHelper = new ParserHelper(this); ParseNumber = new ParserNumber(this); } /// /// Begins the parsing process /// /// 开始处理 /// /// public bool Parse() { SearchForKeywords(); SearchForIsolatedNumbers(); if (Options.ParseEpisodeNumber) { SearchForEpisodeNumber(); } SearchForAnimeTitle(); if (Options.ParseReleaseGroup && Empty(Element.ElementCategory.ElementReleaseGroup)) { SearchForReleaseGroup(); } if (Options.ParseEpisodeTitle && !Empty(Element.ElementCategory.ElementEpisodeNumber)) { SearchForEpisodeTitle(); } ValidateElements(); return Empty(Element.ElementCategory.ElementAnimeTitle); } /// /// Search for anime keywords. /// /// 主要是根据关键词列表匹配标记(token),并将匹配到的关键字添加到元素列表 /// private void SearchForKeywords() { for (var i = 0; i < Tokens.Count; i++) { var token = Tokens[i]; /** 过滤已知标记类型的标记 */ if (token.Category != Token.TokenCategory.Unknown) continue; var word = token.Content; word = word.Trim(" -".ToCharArray()); if (string.IsNullOrEmpty(word)) continue; // Don't bother if the word is a number that cannot be CRC if (word.Length != 8 && StringHelper.IsNumericString(word)) continue; var keyword = KeywordManager.Normalize(word); var category = Element.ElementCategory.ElementUnknown; var keywordOptions = new KeywordOptions(); /** 首先在关键词列表中匹配关键词,如无则执行else */ if (KeywordManager.FindAndSet(keyword, ref category, ref keywordOptions)) { /** 根据配置跳过发布组元素 */ if (!Options.ParseReleaseGroup && category == Element.ElementCategory.ElementReleaseGroup) continue; /** 跳过配置为不能搜索的元素 */ if (!ParseHelper.IsElementCategorySearchable(category) || !keywordOptions.Searchable) continue; /** 跳过已经包含的Singular元素类别 */ if (ParseHelper.IsElementCategorySingular(category) && !Empty(category)) continue; switch (category) { case Element.ElementCategory.ElementAnimeSeasonPrefix: ParseHelper.CheckAndSetAnimeSeasonKeyword(token, i); continue; case Element.ElementCategory.ElementEpisodePrefix when keywordOptions.Valid: ParseHelper.CheckExtentKeyword(Element.ElementCategory.ElementEpisodeNumber, i, token); continue; case Element.ElementCategory.ElementReleaseVersion: word = word.Substring(1); break; case Element.ElementCategory.ElementVolumePrefix: ParseHelper.CheckExtentKeyword(Element.ElementCategory.ElementVolumeNumber, i, token); continue; } } else { /** 如果还不存在ElementFileChecksum元素类型,且该标记满足Crc32规则 */ if (Empty(Element.ElementCategory.ElementFileChecksum) && ParserHelper.IsCrc32(word)) { category = Element.ElementCategory.ElementFileChecksum; } /** 如果还不存在ElementVideoResolution元素类型,且该标记满足分辨率规则 */ else if (Empty(Element.ElementCategory.ElementVideoResolution) && ParserHelper.IsResolution(word)) { category = Element.ElementCategory.ElementVideoResolution; } } /** 如果此标记的元素分类仍为ElementUnknown,则跳过此标记的处理*/ if (category == Element.ElementCategory.ElementUnknown) continue; Elements.Add(new Element(category, word)); if (keywordOptions.Identifiable) { token.Category = Token.TokenCategory.Identifier; } } } /// /// Search for episode number. /// /// 匹配标记列表中的集数 /// private void SearchForEpisodeNumber() { var tokens = new List(); for (var i = 0; i < Tokens.Count; i++) { var token = Tokens[i]; // List all unknown tokens that contain a number if (token.Category == Token.TokenCategory.Unknown && ParserHelper.IndexOfFirstDigit(token.Content) != -1) { tokens.Add(i); } } if (tokens.Count == 0) { // search Japanese Pattern without number if (Empty(Element.ElementCategory.ElementEpisodeNumber)) { for (var i = 0; i < Tokens.Count; i++) { var token = Tokens[i]; if (token.Category == Token.TokenCategory.Unknown && ParserHelper.IndexOfFirstDigit(token.Content) == -1) { ParseNumber.MatchJapaneseCounterPattern(token.Content, token); } } } return; } IsEpisodeKeywordsFound = !Empty(Element.ElementCategory.ElementEpisodeNumber); // If a token matches a known episode pattern, it has to be the episode number if (ParseNumber.SearchForEpisodePatterns(tokens)) return; // We have previously found an episode number via keywords if (!Empty(Element.ElementCategory.ElementEpisodeNumber)) return; // From now on, we're only interested in numeric tokens tokens.RemoveAll(r => !StringHelper.IsNumericString(Tokens[r].Content)); // e.g. "01 (176)", "29 (04)" if (ParseNumber.SearchForEquivalentNumbers(tokens)) return; // e.g. " - 08" if (ParseNumber.SearchForSeparatedNumbers(tokens)) return; // "e.g. "[12]", "(2006)" if (ParseNumber.SearchForIsolatedNumbers(tokens)) return; // Consider using the last number as a last resort ParseNumber.SearchForLastNumber(tokens); } /// /// Search for anime title /// /// 搜索动画名 /// private void SearchForAnimeTitle() { var enclosedTitle = false; var tokenBegin = Token.FindToken(Tokens, 0, Tokens.Count, Token.TokenFlag.FlagNotEnclosed, Token.TokenFlag.FlagUnknown); // without ReleaseGroup, only anime title e.g. "[2005][Paniponi Dash!][BDRIP][1080P][1-26Fin+OVA+SP]" var tokenBeginWithNoReleaseGroup = Tokens.Count; // If that doesn't work, find the first unknown token in the second enclosed // group, assuming that the first one is the release group if (!Token.InListRange(tokenBegin, Tokens)) { tokenBegin = 0; enclosedTitle = true; var skippedPreviousGroup = false; do { tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagUnknown); tokenBeginWithNoReleaseGroup = tokenBegin; if (!Token.InListRange(tokenBegin, Tokens)) break; // Ignore groups that are composed of non-Latin characters or non-Chinese characters // 对于同时有中英文名称,并且两者分割开来,如:“[異域字幕組][漆黑的子彈][Black Bullet][11][1280x720][繁体].mp4”,则只会返回第一个匹配到的 if ((StringHelper.IsMostlyLatinString(Tokens[tokenBegin].Content) || StringHelper.IsMostlyChineseString(Tokens[tokenBegin].Content)) && skippedPreviousGroup) { break; } // if ReleaseGroup is empty if (Options.ParseReleaseGroup && Empty(Element.ElementCategory.ElementReleaseGroup)) { // Get the first unknown token of the next group tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagBracket); tokenBegin = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagUnknown); } // make sure the new token don't in Element.ElementCategory // if in or outListRange // return pretoken // TODO match other ElementCategory if ((Token.InListRange(tokenBegin, Tokens) && KeywordManager.Contains(Element.ElementCategory.ElementAnimeType, Tokens[tokenBegin].Content.ToUpper())) || tokenBegin == Tokens.Count) { tokenBegin = tokenBeginWithNoReleaseGroup; } skippedPreviousGroup = true; } while (Token.InListRange(tokenBegin, Tokens)); } if (!Token.InListRange(tokenBegin, Tokens)) return; // Continue until an identifier (or a bracket, if the title is enclosed) is found var tokenEnd = Token.FindToken( Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagIdentifier, enclosedTitle ? Token.TokenFlag.FlagBracket : Token.TokenFlag.FlagNone); // If within the interval there's an open bracket without its matching pair, // move the upper endpoint back to the bracket if (!enclosedTitle) { var lastBracket = tokenEnd; var bracketOpen = false; for (var i = tokenBegin; i < tokenEnd; i++) { if (Tokens[i].Category != Token.TokenCategory.Bracket) continue; lastBracket = i; bracketOpen = !bracketOpen; } if (bracketOpen) tokenEnd = lastBracket; } // If the interval ends with an enclosed group (e.g. "Anime Title [Fansub]"), // move the upper endpoint back to the beginning of the group. We ignore // parentheses in order to keep certain groups (e.g. "(TV)") intact. if (!enclosedTitle) { var token = Token.FindPrevToken(Tokens, tokenEnd, Token.TokenFlag.FlagNotDelimiter); while (ParseHelper.IsTokenCategory(token, Token.TokenCategory.Bracket) && Tokens[token].Content[0] != ')') { token = Token.FindPrevToken(Tokens, token, Token.TokenFlag.FlagBracket); if (!Token.InListRange(token, Tokens)) continue; tokenEnd = token; token = Token.FindPrevToken(Tokens, tokenEnd, Token.TokenFlag.FlagNotDelimiter); } } ParseHelper.BuildElement(Element.ElementCategory.ElementAnimeTitle, false, Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin)); } /// /// Search for release group /// /// 搜索发布组 /// private void SearchForReleaseGroup() { for (int tokenBegin = 0, tokenEnd = tokenBegin; tokenBegin < Tokens.Count;) { // Find the first enclosed unknown token tokenBegin = Token.FindToken(Tokens, tokenEnd, Tokens.Count, Token.TokenFlag.FlagEnclosed, Token.TokenFlag.FlagUnknown); if (!Token.InListRange(tokenBegin, Tokens)) return; // Continue until a bracket or identifier is found tokenEnd = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagBracket, Token.TokenFlag.FlagIdentifier); // 去除纯数字发布组 if (Regex.Match(Tokens[tokenBegin].Content, ParserNumber.RegexMatchOnlyStart + @"^[0-9]+$" + ParserNumber.RegexMatchOnlyEnd).Success) continue; if (!Token.InListRange(tokenEnd, Tokens) || Tokens[tokenEnd].Category != Token.TokenCategory.Bracket) continue; // Ignore if it's not the first non-delimiter token in group var prevToken = Token.FindPrevToken(Tokens, tokenBegin, Token.TokenFlag.FlagNotDelimiter); if (Token.InListRange(prevToken, Tokens) && Tokens[prevToken].Category != Token.TokenCategory.Bracket) continue; ParseHelper.BuildElement(Element.ElementCategory.ElementReleaseGroup, true, Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin)); return; } } /// /// Search for episode title /// /// 搜索剧集标题 /// private void SearchForEpisodeTitle() { int tokenBegin; var tokenEnd = 0; do { // Find the first non-enclosed unknown token tokenBegin = Token.FindToken(Tokens, tokenEnd, Tokens.Count, Token.TokenFlag.FlagNotEnclosed, Token.TokenFlag.FlagUnknown); if (!Token.InListRange(tokenBegin, Tokens)) return; // Continue until a bracket or identifier is found tokenEnd = Token.FindToken(Tokens, tokenBegin, Tokens.Count, Token.TokenFlag.FlagBracket, Token.TokenFlag.FlagIdentifier); // Ignore if it's only a dash if (tokenEnd - tokenBegin <= 2 && ParserHelper.IsDashCharacter(Tokens[tokenBegin].Content[0])) continue; //if (tokenBegin.Pos == null || tokenEnd.Pos == null) continue; ParseHelper.BuildElement(Element.ElementCategory.ElementEpisodeTitle, false, Tokens.GetRange(tokenBegin, tokenEnd - tokenBegin)); return; } while (Token.InListRange(tokenBegin, Tokens)); } /// /// Search for isolated numbers /// /// 搜索孤立数字的处理逻辑 /// private void SearchForIsolatedNumbers() { for (var i = 0; i < Tokens.Count; i++) { var token = Tokens[i]; /** 跳过括号标记类型的标记 */ if (token.Category == Token.TokenCategory.Bracket) continue; var tokenContent = token.Content; // e.g. "2016-17" const string regexPattern = ParserNumber.RegexMatchOnlyStart + @"(\d{1,4})([-~&+])(\d{2,4})" + ParserNumber.RegexMatchOnlyEnd; var match = Regex.Match(token.Content, regexPattern); if (match.Success) { tokenContent = tokenContent.Split(match.Groups[2].Value)[0]; } // add newtype e.g. "2021 OVA" if (token.Category != Token.TokenCategory.Unknown || !StringHelper.IsNumericString(tokenContent) || !(ParseHelper.IsTokenContainAnimeType(i) ^ ParseHelper.IsTokenIsolated(i))) { continue; } var number = StringHelper.StringToInt(tokenContent); // Anime year if (number >= ParserNumber.AnimeYearMin && number <= ParserNumber.AnimeYearMax) { if (Empty(Element.ElementCategory.ElementAnimeYear)) { Elements.Add(new Element(Element.ElementCategory.ElementAnimeYear, token.Content)); token.Category = Token.TokenCategory.Identifier; continue; } } // Video resolution if (number != 480 && number != 720 && number != 1080 && number != 2160) continue; // If these numbers are isolated, it's more likely for them to be the // video resolution rather than the episode number. Some fansub groups use these without the "p" suffix. // if (!Empty(Element.ElementCategory.ElementVideoResolution)) continue; Elements.Add(new Element(Element.ElementCategory.ElementVideoResolution, token.Content)); token.Category = Token.TokenCategory.Identifier; } } /// /// Validate Elements /// /// 验证元素有效性 /// private void ValidateElements() { if (!Empty(Element.ElementCategory.ElementAnimeType) && !Empty(Element.ElementCategory.ElementEpisodeTitle)) { var episodeTitle = Get(Element.ElementCategory.ElementEpisodeTitle); for (var i = 0; i < Elements.Count;) { var el = Elements[i]; if (el.Category == Element.ElementCategory.ElementAnimeType) { if (episodeTitle.Contains(el.Value)) { if (episodeTitle.Length == el.Value.Length) { Elements.RemoveAll(element => element.Category == Element.ElementCategory.ElementEpisodeTitle); // invalid episode title } else { var keyword = KeywordManager.Normalize(el.Value); if (KeywordManager.Contains(Element.ElementCategory.ElementAnimeType, keyword)) { i = Erase(el); // invalid anime type continue; } } } } ++i; } } } /// /// Returns whether or not the parser contains this category /// /// 判断当前的元素列表是否包含传入的元素类别 /// /// /// 不包含则返回`true`,否则`false` private bool Empty(Element.ElementCategory category) { return Elements.All(element => element.Category != category); } /// /// Returns the value of a particular category /// /// 返回传入元素类别的值 /// /// /// private string Get(Element.ElementCategory category) { var foundElement = Elements.Find(element => element.Category == category); if (foundElement != null) return foundElement.Value; Element e = new Element(category, ""); Elements.Add(e); foundElement = e; return foundElement.Value; } /// /// Deletes the first element with the same element.Category and returns the deleted element's position. /// /// 删除第一个具有相同的元素 /// /// /// 返回被删除元素的位置 private int Erase(Element element) { var removedIdx = -1; for (var i = 0; i < Elements.Count; i++) { var currentElement = Elements[i]; if (element.Category != currentElement.Category) continue; removedIdx = i; Elements.RemoveAt(i); break; } return removedIdx; } } }