Private GIT

Skip to content
Snippets Groups Projects
Commit 19807d01 authored by Luca's avatar Luca
Browse files

GuessIt Libraru Update to version 06b1

parent 132c7562
No related branches found
No related tags found
No related merge requests found
......@@ -18,8 +18,9 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import unicode_literals
__version__ = '0.6-dev'
__version__ = '0.6b1'
__all__ = ['Guess', 'Language',
'guess_file_info', 'guess_video_info',
'guess_movie_info', 'guess_episode_info']
......@@ -73,6 +74,7 @@ else:
from guessit.guess import Guess, merge_all
from guessit.language import Language
from guessit.matcher import IterativeMatcher
from guessit.textutils import clean_string
import logging
log = logging.getLogger(__name__)
......@@ -88,6 +90,86 @@ h = NullHandler()
log.addHandler(h)
def _guess_filename(filename, filetype):
mtree = IterativeMatcher(filename, filetype=filetype)
m = mtree.matched()
if 'language' not in m and 'subtitleLanguage' not in m:
return m
# if we found some language, make sure we didn't cut a title or sth...
mtree2 = IterativeMatcher(filename, filetype=filetype,
opts=['nolanguage', 'nocountry'])
m2 = mtree2.matched()
def find_nodes(tree, props):
"""Yields all nodes containing any of the given props."""
if isinstance(props, base_text_type):
props = [props]
for node in tree.nodes():
if any(prop in node.guess for prop in props):
yield node
def warning(title):
log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
return m
if m.get('title') != m2.get('title'):
title = next(find_nodes(mtree.match_tree, 'title'))
title2 = next(find_nodes(mtree2.match_tree, 'title'))
langs = list(find_nodes(mtree.match_tree, ['language', 'subtitleLanguage']))
if not langs:
return warning('A weird error happened with language detection')
# find the language that is likely more relevant
for lng in langs:
if lng.value in title2.value:
# if the language was detected as part of a potential title,
# look at this one in particular
lang = lng
break
else:
# pick the first one if we don't have a better choice
lang = langs[0]
# language code are rarely part of a title, and those
# should be handled by the Language exceptions anyway
if len(lang.value) <= 3:
return m
# if filetype is subtitle and the language appears last, just before
# the extension, then it is likely a subtitle language
parts = clean_string(title.root.value).split()
if (m['type'] in ['moviesubtitle', 'episodesubtitle'] and
parts.index(lang.value) == len(parts) - 2):
return m
# if the language was in the middle of the other potential title,
# keep the other title (eg: The Italian Job), except if it is at the
# very beginning, in which case we consider it an error
if m2['title'].startswith(lang.value):
return m
elif lang.value in title2.value:
return m2
# if a node is in an explicit group, then the correct title is probably
# the other one
if title.root.node_at(title.node_idx[:2]).is_explicit():
return m2
elif title2.root.node_at(title2.node_idx[:2]).is_explicit():
return m
return warning('Not sure of the title because of the language position')
return m
def guess_file_info(filename, filetype, info=None):
"""info can contain the names of the various plugins, such as 'filename' to
detect filename info, or 'hash_md5' to get the md5 hash of the file.
......@@ -109,8 +191,7 @@ def guess_file_info(filename, filetype, info=None):
for infotype in info:
if infotype == 'filename':
m = IterativeMatcher(filename, filetype=filetype)
result.append(m.matched())
result.append(_guess_filename(filename, filetype))
elif infotype == 'hash_mpc':
from guessit.hash_mpc import hash_file
......@@ -164,7 +245,7 @@ def guess_file_info(filename, filetype, info=None):
# last minute adjustments
# if country is in the guessed properties, make it part of the filename
if 'country' in result:
if 'series' in result and 'country' in result:
result['series'] += ' (%s)' % result['country'].alpha2.upper()
......
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# GuessIt - A library for guessing information from filenames
# Copyright (c) 2011 Nicolas Wack <wackou@gmail.com>
#
# GuessIt is free software; you can redistribute it and/or modify it under
# the terms of the Lesser GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# GuessIt is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Lesser GNU General Public License for more details.
#
# You should have received a copy of the Lesser GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
from __future__ import unicode_literals
from __future__ import print_function
from guessit import u
from guessit import slogging, guess_file_info
from optparse import OptionParser
import logging
def detect_filename(filename, filetype, info=['filename']):
filename = u(filename)
print('For:', filename)
print('GuessIt found:', guess_file_info(filename, filetype, info).nice_string())
def run_demo(episodes=True, movies=True):
# NOTE: tests should not be added here but rather in the tests/ folder
# this is just intended as a quick example
if episodes:
testeps = [ 'Series/Californication/Season 2/Californication.2x05.Vaginatown.HDTV.XviD-0TV.[tvu.org.ru].avi',
'Series/dexter/Dexter.5x02.Hello,.Bandit.ENG.-.sub.FR.HDTV.XviD-AlFleNi-TeaM.[tvu.org.ru].avi',
'Series/Treme/Treme.1x03.Right.Place,.Wrong.Time.HDTV.XviD-NoTV.[tvu.org.ru].avi',
'Series/Duckman/Duckman - 101 (01) - 20021107 - I, Duckman.avi',
'Series/Duckman/Duckman - S1E13 Joking The Chicken (unedited).avi',
'Series/Simpsons/The_simpsons_s13e18_-_i_am_furious_yellow.mpg',
'Series/Simpsons/Saison 12 Français/Simpsons,.The.12x08.A.Bas.Le.Sergent.Skinner.FR.[tvu.org.ru].avi',
'Series/Dr._Slump_-_002_DVB-Rip_Catalan_by_kelf.avi',
'Series/Kaamelott/Kaamelott - Livre V - Second Volet - HD 704x396 Xvid 2 pass - Son 5.1 - TntRip by Slurm.avi'
]
for f in testeps:
print('-'*80)
detect_filename(f, filetype='episode')
if movies:
testmovies = [ 'Movies/Fear and Loathing in Las Vegas (1998)/Fear.and.Loathing.in.Las.Vegas.720p.HDDVD.DTS.x264-ESiR.mkv',
'Movies/El Dia de la Bestia (1995)/El.dia.de.la.bestia.DVDrip.Spanish.DivX.by.Artik[SEDG].avi',
'Movies/Blade Runner (1982)/Blade.Runner.(1982).(Director\'s.Cut).CD1.DVDRip.XviD.AC3-WAF.avi',
'Movies/Dark City (1998)/Dark.City.(1998).DC.BDRip.720p.DTS.X264-CHD.mkv',
'Movies/Sin City (BluRay) (2005)/Sin.City.2005.BDRip.720p.x264.AC3-SEPTiC.mkv',
'Movies/Borat (2006)/Borat.(2006).R5.PROPER.REPACK.DVDRip.XviD-PUKKA.avi', # FIXME: PROPER and R5 get overwritten
'[XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv', # FIXME: title gets overwritten
'Battle Royale (2000)/Battle.Royale.(Batoru.Rowaiaru).(2000).(Special.Edition).CD1of2.DVDRiP.XviD-[ZeaL].avi',
'Movies/Brazil (1985)/Brazil_Criterion_Edition_(1985).CD2.English.srt',
'Movies/Persepolis (2007)/[XCT] Persepolis [H264+Aac-128(Fr-Eng)+ST(Fr-Eng)+Ind].mkv',
'Movies/Toy Story (1995)/Toy Story [HDTV 720p English-Spanish].mkv',
'Movies/Pirates of the Caribbean: The Curse of the Black Pearl (2003)/Pirates.Of.The.Carribean.DC.2003.iNT.DVDRip.XviD.AC3-NDRT.CD1.avi',
'Movies/Office Space (1999)/Office.Space.[Dual-DVDRip].[Spanish-English].[XviD-AC3-AC3].[by.Oswald].avi',
'Movies/The NeverEnding Story (1984)/The.NeverEnding.Story.1.1984.DVDRip.AC3.Xvid-Monteque.avi',
'Movies/Juno (2007)/Juno KLAXXON.avi',
'Movies/Chat noir, chat blanc (1998)/Chat noir, Chat blanc - Emir Kusturica (VO - VF - sub FR - Chapters).mkv',
'Movies/Wild Zero (2000)/Wild.Zero.DVDivX-EPiC.srt',
'Movies/El Bosque Animado (1987)/El.Bosque.Animado.[Jose.Luis.Cuerda.1987].[Xvid-Dvdrip-720x432].avi',
'testsmewt_bugs/movies/Baraka_Edition_Collector.avi'
]
for f in testmovies:
print('-'*80)
detect_filename(f, filetype = 'movie')
def main():
slogging.setupLogging()
parser = OptionParser(usage = 'usage: %prog [options] file1 [file2...]')
parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False,
help = 'display debug output')
parser.add_option('-i', '--info', dest = 'info', default = 'filename',
help = 'the desired information type: filename, hash_mpc or a hash from python\'s '
'hashlib module, such as hash_md5, hash_sha1, ...; or a list of any of '
'them, comma-separated')
parser.add_option('-t', '--type', dest = 'filetype', default = 'autodetect',
help = 'the suggested file type: movie, episode or autodetect')
parser.add_option('-d', '--demo', action='store_true', dest='demo', default=False,
help = 'run a few builtin tests instead of analyzing a file')
options, args = parser.parse_args()
if options.verbose:
logging.getLogger('guessit').setLevel(logging.DEBUG)
if options.demo:
run_demo(episodes=True, movies=True)
else:
if args:
for filename in args:
detect_filename(filename,
filetype = options.filetype,
info = options.info.split(','))
else:
parser.print_help()
if __name__ == '__main__':
main()
......@@ -22,6 +22,7 @@ from __future__ import unicode_literals
from guessit import s, u
import os.path
import zipfile
import io
def split_path(path):
......@@ -84,4 +85,4 @@ def load_file_in_same_dir(ref_file, filename):
zfile = zipfile.ZipFile(zfilename)
return zfile.read('/'.join(path[i + 1:]))
return u(open(os.path.join(*path)).read())
return u(io.open(os.path.join(*path), encoding='utf-8').read())
......@@ -21,6 +21,7 @@
from __future__ import unicode_literals
from guessit import UnicodeMixin, base_text_type, u, s
from guessit.fileutils import load_file_in_same_dir
from guessit.textutils import find_words
from guessit.country import Country
import re
import logging
......@@ -317,7 +318,7 @@ def search_language(string, lang_filter=None):
'is', 'it', 'am', 'mad', 'men', 'man', 'run', 'sin', 'st', 'to',
'no', 'non', 'war', 'min', 'new', 'car', 'day', 'bad', 'bat', 'fan',
'fry', 'cop', 'zen', 'gay', 'fat', 'cherokee', 'got', 'an', 'as',
'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr',
'cat', 'her', 'be', 'hat', 'sun', 'may', 'my', 'mr', 'rum', 'pi',
# french words
'bas', 'de', 'le', 'son', 'vo', 'vf', 'ne', 'ca', 'ce', 'et', 'que',
'mal', 'est', 'vol', 'or', 'mon', 'se',
......@@ -325,7 +326,7 @@ def search_language(string, lang_filter=None):
'la', 'el', 'del', 'por', 'mar',
# other
'ind', 'arw', 'ts', 'ii', 'bin', 'chan', 'ss', 'san', 'oss', 'iii',
'vi', 'ben'
'vi', 'ben', 'da'
])
sep = r'[](){} \._-+'
......@@ -334,7 +335,8 @@ def search_language(string, lang_filter=None):
slow = ' %s ' % string.lower()
confidence = 1.0 # for all of them
for lang in lng_all_names:
for lang in set(find_words(slow)) & lng_all_names:
if lang in lng_common_words:
continue
......@@ -351,7 +353,7 @@ def search_language(string, lang_filter=None):
if lang_filter and language not in lang_filter:
continue
# only allow those languages that have a 2-letter code, those who
# only allow those languages that have a 2-letter code, those that
# don't are too esoteric and probably false matches
if language.lang not in lng3_to_lng2:
continue
......@@ -364,7 +366,7 @@ def search_language(string, lang_filter=None):
else:
# Note: we could either be really confident that we found a
# language or assume that full language names are too
# common words
# common words and lower their confidence accordingly
confidence = 0.3 # going with the low-confidence route here
return language, (pos - 1, end - 1), confidence
......
......@@ -19,15 +19,16 @@
#
from __future__ import unicode_literals
from guessit import PY3, u
from guessit import PY3, u, base_text_type
from guessit.matchtree import MatchTree
from guessit.textutils import normalize_unicode
import logging
log = logging.getLogger(__name__)
class IterativeMatcher(object):
def __init__(self, filename, filetype='autodetect'):
def __init__(self, filename, filetype='autodetect', opts=None):
"""An iterative matcher tries to match different patterns that appear
in the filename.
......@@ -73,6 +74,14 @@ class IterativeMatcher(object):
raise ValueError("filetype needs to be one of %s" % valid_filetypes)
if not PY3 and not isinstance(filename, unicode):
log.warning('Given filename to matcher is not unicode...')
filename = filename.decode('utf-8')
filename = normalize_unicode(filename)
if opts is None:
opts = []
elif isinstance(opts, base_text_type):
opts = opts.split()
self.match_tree = MatchTree(filename)
mtree = self.match_tree
......@@ -112,13 +121,20 @@ class IterativeMatcher(object):
'guess_properties', 'guess_language',
'guess_video_rexps' ]
if 'nolanguage' in opts:
strategy.remove('guess_language')
for name in strategy:
apply_transfo(name)
# more guessers for both movies and episodes
for name in ['guess_bonus_features', 'guess_year', 'guess_country']:
for name in ['guess_bonus_features', 'guess_year']:
apply_transfo(name)
if 'nocountry' not in opts:
apply_transfo('guess_country')
# split into '-' separated subgroups (with required separator chars
# around the dash)
apply_transfo('split_on_dash')
......
......@@ -150,7 +150,7 @@ prop_single = { 'releaseGroup': [ 'ESiR', 'WAF', 'SEPTiC', r'\[XCT\]', 'iNT', 'P
'CtrlHD', 'POD', 'WiKi', 'DIMENSION', 'IMMERSE', 'FQM',
'2HD', 'REPTiLE', 'CTU', 'HALCYON', 'EbP', 'SiTV',
'SAiNTS', 'HDBRiSe', 'AlFleNi-TeaM', 'EVOLVE', '0TV',
'TLA', 'NTB', 'ASAP', 'MOMENTUM' ],
'TLA', 'NTB', 'ASAP', 'MOMENTUM', 'FoV', 'D-Z0N3' ],
'other': [ 'PROPER', 'REPACK', 'LIMITED', 'DualAudio', 'Audiofixed', 'R5',
'complete', 'classic', # not so sure about these ones, could appear in a title
......
......@@ -21,6 +21,8 @@
from __future__ import unicode_literals
import logging
import sys
import os, os.path
GREEN_FONT = "\x1B[0;32m"
YELLOW_FONT = "\x1B[0;33m"
......@@ -29,19 +31,26 @@ RED_FONT = "\x1B[0;31m"
RESET_FONT = "\x1B[0m"
def setupLogging(colored=True):
def setupLogging(colored=True, with_time=False, with_thread=False, filename=None):
"""Set up a nice colored logger as the main application logger."""
class SimpleFormatter(logging.Formatter):
def __init__(self):
self.fmt = '%(levelname)-8s %(module)s:%(funcName)s -- %(message)s'
def __init__(self, with_time, with_thread):
self.fmt = (('%(asctime)s ' if with_time else '') +
'%(levelname)-8s ' +
'[%(name)s:%(funcName)s]' +
('[%(threadName)s]' if with_thread else '') +
' -- %(message)s')
logging.Formatter.__init__(self, self.fmt)
class ColoredFormatter(logging.Formatter):
def __init__(self):
self.fmt = ('%(levelname)-8s ' +
BLUE_FONT + '%(mname)-8s %(mmodule)s:%(funcName)s' +
RESET_FONT + ' -- %(message)s')
def __init__(self, with_time, with_thread):
self.fmt = (('%(asctime)s ' if with_time else '') +
'-CC-%(levelname)-8s ' +
BLUE_FONT + '[%(name)s:%(funcName)s]' +
RESET_FONT + ('[%(threadName)s]' if with_thread else '') +
' -- %(message)s')
logging.Formatter.__init__(self, self.fmt)
def format(self, record):
......@@ -50,17 +59,29 @@ def setupLogging(colored=True):
record.mmodule = '.'.join(modpath[1:])
result = logging.Formatter.format(self, record)
if record.levelno == logging.DEBUG:
return BLUE_FONT + result
color = BLUE_FONT
elif record.levelno == logging.INFO:
return GREEN_FONT + result
color = GREEN_FONT
elif record.levelno == logging.WARNING:
return YELLOW_FONT + result
color = YELLOW_FONT
else:
return RED_FONT + result
color = RED_FONT
result = result.replace('-CC-', color)
return result
if filename is not None:
# make sure we can write to our log file
logdir = os.path.dirname(filename)
if not os.path.exists(logdir):
os.makedirs(logdir)
ch = logging.FileHandler(filename, mode='w')
ch.setFormatter(SimpleFormatter(with_time, with_thread))
else:
ch = logging.StreamHandler()
if colored and sys.platform != 'win32':
ch.setFormatter(ColoredFormatter())
ch.setFormatter(ColoredFormatter(with_time, with_thread))
else:
ch.setFormatter(SimpleFormatter())
ch.setFormatter(SimpleFormatter(with_time, with_thread))
logging.getLogger().addHandler(ch)
......@@ -23,9 +23,13 @@ from guessit import s
from guessit.patterns import sep
import functools
import unicodedata
import re
# string-related functions
def normalize_unicode(s):
return unicodedata.normalize('NFC', s)
def strip_brackets(s):
if not s:
......@@ -53,6 +57,13 @@ def clean_string(s):
return result
_words_rexp = re.compile('\w+', re.UNICODE)
def find_words(s):
return _words_rexp.findall(s.replace('_', ' '))
def reorder_title(title):
ltitle = title.lower()
if ltitle[-4:] == ',the':
......
......@@ -27,7 +27,7 @@ log = logging.getLogger(__name__)
# list of common words which could be interpreted as countries, but which
# are far too common to be able to say they represent a country
country_common_words = frozenset([ 'bt' ])
country_common_words = frozenset([ 'bt', 'bb' ])
def process(mtree):
for node in mtree.unidentified_leaves():
......
......@@ -22,7 +22,7 @@ from __future__ import unicode_literals
from guessit import Guess
from guessit.transfo import SingleNodeGuesser
from guessit.language import search_language
from guessit.textutils import clean_string
from guessit.textutils import clean_string, find_words
import logging
log = logging.getLogger(__name__)
......@@ -31,12 +31,6 @@ log = logging.getLogger(__name__)
def guess_language(string):
language, span, confidence = search_language(string)
if language:
# is it a subtitle language?
if 'sub' in clean_string(string[:span[0]]).lower().split(' '):
return (Guess({'subtitleLanguage': language},
confidence=confidence),
span)
else:
return (Guess({'language': language},
confidence=confidence),
span)
......@@ -46,3 +40,4 @@ def guess_language(string):
def process(mtree):
SingleNodeGuesser(guess_language, None, log).process(mtree)
# Note: 'language' is promoted to 'subtitleLanguage' in the post_process transfo
......@@ -20,6 +20,7 @@
from __future__ import unicode_literals
from guessit import Guess
import unicodedata
import logging
log = logging.getLogger(__name__)
......
......@@ -32,6 +32,19 @@ def get_patterns(property_name):
CODECS = get_patterns('videoCodec')
FORMATS = get_patterns('format')
GROUP_NAMES = [ r'(?P<videoCodec>' + codec + r')-?(?P<releaseGroup>.*?)[ \.]'
for codec in CODECS ]
GROUP_NAMES += [ r'(?P<format>' + fmt + r')-?(?P<releaseGroup>.*?)[ \.]'
for fmt in FORMATS ]
GROUP_NAMES2 = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
for codec in CODECS ]
GROUP_NAMES2 += [ r'\.(?P<format>' + fmt + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
for fmt in FORMATS ]
GROUP_NAMES = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES ]
GROUP_NAMES2 = [ re.compile(r, re.IGNORECASE) for r in GROUP_NAMES2 ]
def adjust_metadata(md):
return dict((property_name, compute_canonical_form(property_name, value) or value)
for property_name, value in md.items())
......@@ -39,13 +52,8 @@ def adjust_metadata(md):
def guess_release_group(string):
# first try to see whether we have both a known codec and a known release group
group_names = [ r'(?P<videoCodec>' + codec + r')-?(?P<releaseGroup>.*?)[ \.]'
for codec in CODECS ]
group_names += [ r'(?P<format>' + fmt + r')-?(?P<releaseGroup>.*?)[ \.]'
for fmt in FORMATS ]
for rexp in group_names:
match = re.search(rexp, string, re.IGNORECASE)
for rexp in GROUP_NAMES:
match = rexp.search(string)
if match:
metadata = match.groupdict()
release_group = compute_canonical_form('releaseGroup', metadata['releaseGroup'])
......@@ -55,13 +63,8 @@ def guess_release_group(string):
# pick anything as releaseGroup as long as we have a codec in front
# this doesn't include a potential dash ('-') ending the release group
# eg: [...].X264-HiS@SiLUHD-English.[...]
group_names = [ r'\.(?P<videoCodec>' + codec + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
for codec in CODECS ]
group_names += [ r'\.(?P<format>' + fmt + r')-(?P<releaseGroup>.*?)(-(.*?))?[ \.]'
for fmt in FORMATS ]
for rexp in group_names:
match = re.search(rexp, string, re.IGNORECASE)
for rexp in GROUP_NAMES2:
match = rexp.search(string)
if match:
return adjust_metadata(match.groupdict()), (match.start(1), match.end(2))
......
......@@ -20,7 +20,7 @@
from __future__ import unicode_literals
from guessit.patterns import subtitle_exts
from guessit.textutils import reorder_title
from guessit.textutils import reorder_title, find_words
import logging
log = logging.getLogger(__name__)
......@@ -46,6 +46,15 @@ def process(mtree):
node == mtree.leaves()[-2]):
promote_subtitle()
# - if we find the word 'sub' before the language, and in the same explicit
# group, then upgrade the language
explicit_group = mtree.node_at(node.node_idx[:2])
group_str = explicit_group.value.lower()
if ('sub' in find_words(group_str) and
0 <= group_str.find('sub') < (node.span[0] - explicit_group.span[0])):
promote_subtitle()
# - if a language is in an explicit group just preceded by "st",
# it is a subtitle language (eg: '...st[fr-eng]...')
try:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment