3333
# Download locations of the prebuilt mystem archives, keyed by platform name.
# 'linux' and 'win' map a bitness label ('32bit' / '64bit') to a URL, while
# 'darwin' maps directly to a single URL (there is no per-bitness split).
# The 32-bit entries point at 3.0 archives; the other entries point at 3.1.
_TARBALL_URLS = {
    'linux': {
        '32bit': "http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.5-32bit.tar.gz",
        '64bit': "http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz",
    },
    'darwin': "http://download.cdn.yandex.net/mystem/mystem-3.1-macosx.tar.gz",
    'win': {
        '32bit': "http://download.cdn.yandex.net/mystem/mystem-3.0-win7-32bit.zip",
        '64bit': "http://download.cdn.yandex.net/mystem/mystem-3.1-win-64bit.zip",
    },
}
@@ -154,19 +156,25 @@ class Mystem(object):
154156
155157 :param mystem_bin: path to mystem binary
156158 :type mystem_bin: str
157- :param grammar_info:glue grammatical informationfor same lemmas in output.
159+ :param grammar_info: print grammatical information (-i)
158160 :type grammar_info: bool
159- :param disambiguation: apply disambiguation
161+ :param disambiguation: apply disambiguation (-d)
160162 :type disambiguation: bool
161- :param entire_input: copy entire input to output
163+ :param entire_input: copy entire input to output (-c)
162164 :type entire_input: bool
163- :param weight: print context-independent lemma weight
165+ :param glue_grammar_info: glue grammatical information for same lemmas in output (works only with grammar_info=True) (-g)
166+ :type glue_grammar_info: bool
167+ :param weight: print context-independent lemma weight (--weight)
164168 :type weight: bool
165- :param generate_all: generate all possible hypotheses
169+ :param generate_all: generate all possible hypotheses for non-dictionary words (--generate-all)
166170 :type generate_all: bool
167- :param fixlist: path to a custom dictionary to use for analysis
171+ :param no_bastards: print only dictionary words (-w)
172+ :type no_bastards: bool
173+ :param end_of_sentence: print end of sentence mark (works only with entire_input=True) (-s)
174+ :type end_of_sentence: bool
175+ :param fixlist: path to a custom dictionary to use for analysis (--fixlist)
168176 :type fixlist: str
169- :param use_english_names: english names of grammemes
177+ :param use_english_names: english names of grammemes (--eng-gr)
170178 :type use_english_names: bool
171179
172180 .. note:: Default value of :py:attr:`mystem_bin` can be overridden by :envvar:`MYSTEM_BIN`.
@@ -178,19 +186,27 @@ def __init__(
178186grammar_info = True ,
179187disambiguation = True ,
180188entire_input = True ,
181- weight = True ,
189+ glue_grammar_info = True ,
190+ weight = False ,
182191generate_all = False ,
192+ no_bastards = False ,
193+ end_of_sentence = False ,
183194fixlist = None ,
184195use_english_names = False
185196 ):
186197self ._mystem_bin = mystem_bin
187198self ._grammar_info = grammar_info
188199self ._disambiguation = disambiguation
189200self ._entire_input = entire_input
201+ self ._glue_grammar_info = glue_grammar_info
190202self ._weight = weight
191203self ._generate_all = generate_all
204+ self ._no_bastards = no_bastards
205+ self ._end_of_sentence = end_of_sentence
192206self ._fixlist = fixlist
193207self ._use_english_names = use_english_names
208+
209+ self ._file_path = ""
194210self ._procin = None
195211self ._procout = None
196212self ._procout_no = None
@@ -205,26 +221,32 @@ def __init__(
205221
206222self ._mystemargs = ["--format" ,"json" ]
207223
208- if self ._grammar_info is True :
209- self ._mystemargs .append ('-gi' )
224+ if self ._grammar_info :
225+ self ._mystemargs .append ('-i' )
226+ if self ._glue_grammar_info :
227+ self ._mystemargs .append ('-g' )
210228
211- if self ._disambiguation is True :
229+ if self ._disambiguation :
212230self ._mystemargs .append ('-d' )
213231
214- if self ._entire_input is True :
232+ if self ._entire_input :
215233self ._mystemargs .append ('-c' )
234+ if self ._no_bastards :
235+ self ._mystemargs .append ('-w' )
236+ if self ._end_of_sentence :
237+ self ._mystemargs .append ('-s' )
216238
217- if self ._weight is True :
239+ if self ._weight :
218240self ._mystemargs .append ('--weight' )
219241
220- if self ._generate_all is True :
242+ if self ._generate_all :
221243self ._mystemargs .append ('--generate-all' )
222244
223245if self ._fixlist is not None :
224246self ._mystemargs .append ('--fixlist' )
225247self ._mystemargs .append (self ._fixlist )
226248
227- if self ._use_english_names is True :
249+ if self ._use_english_names :
228250self ._mystemargs .append ('--eng-gr' )
229251
230252def __del__ (self ):
@@ -260,7 +282,10 @@ def close(self):
260282self ._proc = None
261283
262284def _start_mystem (self ):
263- self ._proc = subprocess .Popen ([self ._mystem_bin ]+ self ._mystemargs ,
285+ Mystem_args = [self ._mystem_bin ]+ self ._mystemargs
286+ if self ._file_path :
287+ Mystem_args .append (self ._file_path )
288+ self ._proc = subprocess .Popen (Mystem_args ,
264289stdin = subprocess .PIPE ,
265290stdout = subprocess .PIPE ,
266291bufsize = 0 ,
@@ -270,39 +295,51 @@ def _start_mystem(self):
270295self ._procout_no = self ._procout .fileno ()
271296_set_non_blocking (self ._procout )
272297
def analyze(self, text='', file_path=None):
    """
    Make morphology analysis for a text.

    :param text: text to analyze; it is processed line by line.
    :type text: str
    :param file_path: alternative mode: if defined, file_path will be used to open utf8 text file for analysis.
                      Argument text is not used in this case.
    :type file_path: str
    :returns: result of morphology analysis.
    :rtype: list
    """

    self._file_path = file_path

    if not file_path:
        # Text mode: feed the input line by line; if the pipe to the
        # process is broken, restart the process once and retry the line.
        result = []
        for line in text.splitlines():
            try:
                result.extend(self._analyze_impl(line))
            except broken_pipe:
                self.close()
                self.start()
                result.extend(self._analyze_impl(line))
        return result

    # File mode: the path is only handed over when a new process is started,
    # so a process left over from an earlier call must not be reused --
    # nothing is written to it in this mode and the read would wait for
    # output that never arrives.
    if getattr(self, '_proc', None) is not None:
        self.close()
    try:
        return list(self._analyze_impl(''))
    finally:
        # Leave no file-bound process or stale path behind, so that the
        # next call starts from a clean state.
        self._file_path = None
        if getattr(self, '_proc', None) is not None:
            self.close()
292326
293- def lemmatize (self ,text ):
327+ def lemmatize (self ,text = '' , file_path = None ):
294328"""
295329 Make morphology analysis for a text and return list of lemmas.
296330
297- :param text: text to analyze
298331 :type text: str
332+ :param text: text to analyze
333+ :type file_path: str
334+ :param file_path: alternative mode: if defined, file_path will be used to open utf8 text file for analysis.
335+ Argument text is not used in this case.
299336 :returns: list of lemmas
300337 :rtype: list
301338 """
302339
303340need_encode = (sys .version_info [0 ]< 3 and isinstance (text ,str ))
304341
305- infos = self .analyze (text )
342+ infos = self .analyze (text , file_path = file_path )
306343lemmas = list (ifilter (None ,imap (self ._get_lemma ,infos )))
307344
308345if need_encode is True :
@@ -318,9 +355,10 @@ def _analyze_impl(self, text):
318355if self ._proc is None :
319356self ._start_mystem ()
320357
321- self ._procin .write (text )
322- self ._procin .write (_NL )
323- self ._procin .flush ()
358+ if not self ._file_path :
359+ self ._procin .write (text )
360+ self ._procin .write (_NL )
361+ self ._procin .flush ()
324362
325363sio = StringIO ()
326364out = None
@@ -330,13 +368,14 @@ def _analyze_impl(self, text):
330368try :
331369out = self ._procout .read ()
332370sio .write (out )
333- obj = json .loads (sio .getvalue ().decode ('utf-8' ))
371+ out = sio .getvalue ().decode ('utf-8' )
372+ obj = self ._process_json_output (out )
334373break
335374except (IOError ,ValueError ):
336375rd ,_ ,_ = select .select ([self ._procout_no ], [], [],30 )
337376if self ._procout_no not in rd :
338377raise RuntimeError ("Problem has been occured. Current state:\n text:\n %r\n out:\n %r\n sio:\n %r" %
339- (text ,out ,sio .getvalue ()))
378+ (text [ 0 : 2000 ] ,out [ 0 : 2000 ] ,sio .getvalue ()))
340379
341380return obj
342381else :
@@ -347,17 +386,18 @@ def _analyze_impl(self, text):
347386if self ._proc is None :
348387self ._start_mystem ()
349388
350- self ._procin .write (text )
351- self ._procin .write (_NL )
389+ if not self ._file_path :
390+ self ._procin .write (text )
391+ self ._procin .write (_NL )
352392
353393out ,_ = self ._proc .communicate ()
354394self ._proc = None
355395try :
356- #obj =json.loads(out )
357- obj = json . loads (out . decode ( 'utf-8' ) )
396+ out = out . decode ( 'utf-8' )
397+ obj = self . _process_json_output (out )
358398except (IOError ,ValueError ):
359399raise RuntimeError ("Problem has been occured. Current state:\n text:\n %r\n out:\n %r" %
360- (text ,out ))
400+ (text [ 0 : 2000 ] ,out [ 0 : 2000 ] ))
361401
362402return obj
363403
@@ -367,3 +407,39 @@ def _get_lemma(o):
367407return o ['analysis' ][0 ]['lex' ]
368408except (KeyError ,IndexError ):
369409return o ['text' ]if 'text' in o else None
410+
@staticmethod
def get_pos(token):
    """
    Get main part-of-speech tag for token.

    The tag is the leading part of the ``gr`` string of the first
    hypothesis, i.e. everything before the first ``=`` or ``,``.

    :param token: one item of the analysis result
    :type token: dict
    :returns: the tag; an empty string if the first hypothesis has no
              ``gr`` value; ``None`` if the token has no hypotheses.
    :rtype: str or None
    """
    hypotheses = token.get('analysis')
    if not hypotheses:
        # Missing or empty 'analysis': there is nothing to tag.
        return None

    gr = hypotheses[0].get('gr', '')
    return gr.split('=')[0].split(',')[0]
420+
@staticmethod
def _process_json_output(out):
    """
    Parse line-oriented JSON output into one flat list.

    Every non-blank line is decoded as JSON and its items are appended to
    the result. Line breaks occur if the file path goes to the analysis
    function (file_path parameter is used).

    :param out: decoded output text
    :type out: str
    :returns: items of all decoded lines, in order.
    :rtype: list
    :raises ValueError: if a non-blank line is not valid JSON.
    """
    obj = []
    for line in out.split('\n'):
        # With '\r\n' separators a blank line is left as '\r'; strip it so
        # that it is skipped instead of being handed to the JSON decoder.
        line = line.strip()
        if line:
            obj.extend(json.loads(line))
    return obj
432+
@staticmethod
def get_printable_repr(token):
    """
    Get string with results of Mystem parsing for token in human readable representation.

    A token without an ``analysis`` key is rendered as ``sep: <repr of text>``.
    Otherwise the first line shows the text and the number of hypotheses,
    followed by one ``lex:qual:wt:gr`` entry per hypothesis; entries after
    the first go on their own lines, indented to line up under the first.

    :param token: one item of the analysis result
    :type token: dict
    :returns: printable description of the token.
    :rtype: str
    """
    if 'analysis' not in token:
        return 'sep: ' + repr(token['text'])

    hypotheses = token['analysis']
    header = 'lex: {:13} [{}]:'.format(repr(token['text']), len(hypotheses))
    variants = (
        ' {}:{}:{}:{}'.format(h['lex'], h.get('qual', ''), h.get('wt', ''), h.get('gr', ''))
        for h in hypotheses
    )
    # Continuation lines are padded by the header width so that all
    # hypotheses start in the same column.
    joiner = '\n' + ' ' * len(header)
    return header + joiner.join(variants)