Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 5fcc151

Browse files
Merge pull request #38 from smoke-b/master3
rggu improvements
2 parents c7aa509 + 4eca0fd, commit 5fcc151

File tree

3 files changed

+116
-41
lines changed

3 files changed

+116
-41
lines changed

‎.travis.yml‎

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,9 @@
11
language:python
22
python:
3-
-"2.6"
43
-"2.7"
5-
-"3.3"
64
-"3.4"
75
-"3.6"
6+
-"3.7"
87
-"pypy"
98
install:
109
-pip install -r requirements-dev.txt

‎pymystem3/mystem.py‎

Lines changed: 114 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -33,10 +33,12 @@
3333

3434
_TARBALL_URLS= {
3535
'linux': {
36+
'32bit':"http://download.cdn.yandex.net/mystem/mystem-3.0-linux3.5-32bit.tar.gz",
3637
'64bit':"http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz",
3738
},
3839
'darwin':"http://download.cdn.yandex.net/mystem/mystem-3.1-macosx.tar.gz",
3940
'win': {
41+
'32bit':"http://download.cdn.yandex.net/mystem/mystem-3.0-win7-32bit.zip",
4042
'64bit':"http://download.cdn.yandex.net/mystem/mystem-3.1-win-64bit.zip",
4143
},
4244
}
@@ -154,19 +156,25 @@ class Mystem(object):
154156
155157
:param mystem_bin: path to mystem binary
156158
:type mystem_bin: str
157-
:param grammar_info:glue grammatical informationfor same lemmas in output.
159+
:param grammar_info:print grammatical information(-i)
158160
:type grammar_info: bool
159-
:param disambiguation: apply disambiguation
161+
:param disambiguation: apply disambiguation (-d)
160162
:type disambiguation: bool
161-
:param entire_input: copy entire input to output
163+
:param entire_input: copy entire input to output (-c)
162164
:type entire_input: bool
163-
:param weight: print context-independent lemma weight
165+
:param glue_grammar_info: glue grammatical information for same lemmas in output (works only with grammar_info=True) (-g)
166+
:type glue_grammar_info: bool
167+
:param weight: print context-independent lemma weight (--weight)
164168
:type weight: bool
165-
:param generate_all: generate all possible hypotheses
169+
:param generate_all: generate all possible hypotheses for non-dictionary words (--generate-all)
166170
:type generate_all: bool
167-
:param fixlist: path to a custom dictionary to use for analysis
171+
:param no_bastards: print only dictionary words (-w)
172+
:type no_bastards: bool
173+
:param end_of_sentence: print end of sentence mark (works only with entire_input=True) (-s)
174+
:type end_of_sentence: bool
175+
:param fixlist: path to a custom dictionary to use for analysis (--fixlist)
168176
:type fixlist: str
169-
:param use_english_names: english names of grammemes
177+
:param use_english_names: english names of grammemes (--eng-gr)
170178
:type use_english_names: bool
171179
172180
.. note:: Default value of :py:attr:`mystem_bin` can be overwritted by :envvar:`MYSTEM_BIN`.
@@ -178,19 +186,27 @@ def __init__(
178186
grammar_info=True,
179187
disambiguation=True,
180188
entire_input=True,
181-
weight=True,
189+
glue_grammar_info=True,
190+
weight=False,
182191
generate_all=False,
192+
no_bastards=False,
193+
end_of_sentence=False,
183194
fixlist=None,
184195
use_english_names=False
185196
):
186197
self._mystem_bin=mystem_bin
187198
self._grammar_info=grammar_info
188199
self._disambiguation=disambiguation
189200
self._entire_input=entire_input
201+
self._glue_grammar_info=glue_grammar_info
190202
self._weight=weight
191203
self._generate_all=generate_all
204+
self._no_bastards=no_bastards
205+
self._end_of_sentence=end_of_sentence
192206
self._fixlist=fixlist
193207
self._use_english_names=use_english_names
208+
209+
self._file_path=""
194210
self._procin=None
195211
self._procout=None
196212
self._procout_no=None
@@ -205,26 +221,32 @@ def __init__(
205221

206222
self._mystemargs= ["--format","json"]
207223

208-
ifself._grammar_infoisTrue:
209-
self._mystemargs.append('-gi')
224+
ifself._grammar_info:
225+
self._mystemargs.append('-i')
226+
ifself._glue_grammar_info:
227+
self._mystemargs.append('-g')
210228

211-
ifself._disambiguationisTrue:
229+
ifself._disambiguation:
212230
self._mystemargs.append('-d')
213231

214-
ifself._entire_inputisTrue:
232+
ifself._entire_input:
215233
self._mystemargs.append('-c')
234+
ifself._no_bastards:
235+
self._mystemargs.append('-w')
236+
ifself._end_of_sentence:
237+
self._mystemargs.append('-s')
216238

217-
ifself._weightisTrue:
239+
ifself._weight:
218240
self._mystemargs.append('--weight')
219241

220-
ifself._generate_allisTrue:
242+
ifself._generate_all:
221243
self._mystemargs.append('--generate-all')
222244

223245
ifself._fixlistisnotNone:
224246
self._mystemargs.append('--fixlist')
225247
self._mystemargs.append(self._fixlist)
226248

227-
ifself._use_english_namesisTrue:
249+
ifself._use_english_names:
228250
self._mystemargs.append('--eng-gr')
229251

230252
def__del__(self):
@@ -260,7 +282,10 @@ def close(self):
260282
self._proc=None
261283

262284
def_start_mystem(self):
263-
self._proc=subprocess.Popen([self._mystem_bin]+self._mystemargs,
285+
Mystem_args= [self._mystem_bin]+self._mystemargs
286+
ifself._file_path:
287+
Mystem_args.append(self._file_path)
288+
self._proc=subprocess.Popen(Mystem_args,
264289
stdin=subprocess.PIPE,
265290
stdout=subprocess.PIPE,
266291
bufsize=0,
@@ -270,39 +295,51 @@ def _start_mystem(self):
270295
self._procout_no=self._procout.fileno()
271296
_set_non_blocking(self._procout)
272297

273-
defanalyze(self,text):
298+
defanalyze(self,text='',file_path=None):
274299
"""
275300
Make morphology analysis for a text.
276301
277-
:param text: text to analyze
278302
:type text: str
303+
:param text: text to analyze
304+
:type file_path: str
305+
:param file_path: alternative mode: if defined, file_path will be used to open utf8 text file for analysis.
306+
Argument text is not used in this case.
279307
:returns: result of morphology analysis.
280308
:rtype: dict
281309
"""
282310

283311
result= []
284-
forlineintext.splitlines():
285-
try:
286-
result.extend(self._analyze_impl(line))
287-
exceptbroken_pipe:
288-
self.close()
289-
self.start()
290-
result.extend(self._analyze_impl(line))
312+
self._file_path=file_path
313+
314+
ifself._file_path:
315+
# file path will be used and passed to mystem.exe
316+
result.extend(self._analyze_impl(''))
317+
else:
318+
forlineintext.splitlines():
319+
try:
320+
result.extend(self._analyze_impl(line))
321+
exceptbroken_pipe:
322+
self.close()
323+
self.start()
324+
result.extend(self._analyze_impl(line))
291325
returnresult
292326

293-
deflemmatize(self,text):
327+
deflemmatize(self,text='',file_path=None):
294328
"""
295329
Make morphology analysis for a text and return list of lemmas.
296330
297-
:param text: text to analyze
298331
:type text: str
332+
:param text: text to analyze
333+
:type file_path: str
334+
:param file_path: alternative mode: if defined, file_path will be used to open utf8 text file for analysis.
335+
Argument text is not used in this case.
299336
:returns: list of lemmas
300337
:rtype: list
301338
"""
302339

303340
need_encode= (sys.version_info[0]<3andisinstance(text,str))
304341

305-
infos=self.analyze(text)
342+
infos=self.analyze(text,file_path=file_path)
306343
lemmas=list(ifilter(None,imap(self._get_lemma,infos)))
307344

308345
ifneed_encodeisTrue:
@@ -318,9 +355,10 @@ def _analyze_impl(self, text):
318355
ifself._procisNone:
319356
self._start_mystem()
320357

321-
self._procin.write(text)
322-
self._procin.write(_NL)
323-
self._procin.flush()
358+
ifnotself._file_path:
359+
self._procin.write(text)
360+
self._procin.write(_NL)
361+
self._procin.flush()
324362

325363
sio=StringIO()
326364
out=None
@@ -330,13 +368,14 @@ def _analyze_impl(self, text):
330368
try:
331369
out=self._procout.read()
332370
sio.write(out)
333-
obj=json.loads(sio.getvalue().decode('utf-8'))
371+
out=sio.getvalue().decode('utf-8')
372+
obj=self._process_json_output(out)
334373
break
335374
except (IOError,ValueError):
336375
rd,_,_=select.select([self._procout_no], [], [],30)
337376
ifself._procout_nonotinrd:
338377
raiseRuntimeError("Problem has been occured. Current state:\ntext:\n%r\nout:\n%r\nsio:\n%r"%
339-
(text,out,sio.getvalue()))
378+
(text[0:2000],out[0:2000],sio.getvalue()))
340379

341380
returnobj
342381
else:
@@ -347,17 +386,18 @@ def _analyze_impl(self, text):
347386
ifself._procisNone:
348387
self._start_mystem()
349388

350-
self._procin.write(text)
351-
self._procin.write(_NL)
389+
ifnotself._file_path:
390+
self._procin.write(text)
391+
self._procin.write(_NL)
352392

353393
out,_=self._proc.communicate()
354394
self._proc=None
355395
try:
356-
#obj =json.loads(out)
357-
obj=json.loads(out.decode('utf-8'))
396+
out=out.decode('utf-8')
397+
obj=self._process_json_output(out)
358398
except (IOError,ValueError):
359399
raiseRuntimeError("Problem has been occured. Current state:\ntext:\n%r\nout:\n%r"%
360-
(text,out))
400+
(text[0:2000],out[0:2000]))
361401

362402
returnobj
363403

@@ -367,3 +407,39 @@ def _get_lemma(o):
367407
returno['analysis'][0]['lex']
368408
except (KeyError,IndexError):
369409
returno['text']if'text'inoelseNone
410+
411+
@staticmethod
412+
defget_pos(token):
413+
""" Get main part-of-speech tag for token. """
414+
analysis=token.get('analysis')
415+
ifnotanalysis:
416+
returnNone
417+
418+
gr=analysis[0].get('gr','')
419+
returngr.split('=')[0].split(',')[0]
420+
421+
@staticmethod
422+
def_process_json_output(out):
423+
"""
424+
Delete all empty lines and join json output into one line
425+
Line breaks occur if the file path goes to the analysis function (file_path parameter is used)
426+
"""
427+
obj= []
428+
forlineinout.split('\n'):# really, on windows separator is '\r\n', but that is not a problem
429+
ifline:
430+
obj.extend(json.loads(line))
431+
returnobj
432+
433+
@staticmethod
434+
defget_printable_repr(token):
435+
""" Get string with results of Mystem parsing for token in human readable representation. """
436+
437+
if'analysis'notintoken:
438+
return'sep: '+repr(token['text'])
439+
440+
hypotheses=token['analysis']
441+
s='lex: {:13} [{}]:'.format(repr(token['text']),len(hypotheses))
442+
variants= (' {}:{}:{}:{}'.format(h['lex'],h.get('qual',''),h.get('wt',''),h.get('gr',''))forhinhypotheses)
443+
tab=len(s)
444+
s+= ('\n'+' '*tab).join(variants)
445+
returns

‎setup.py‎

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -165,7 +165,7 @@ def _lint():
165165
project_python_files= [filenameforfilenameinget_project_files()
166166
iffilename.endswith(b'.py')]
167167
retcode=subprocess.call(
168-
['flake8','--max-complexity=12','--ignore=E265','--max-line-length=120']+project_python_files)
168+
['flake8','--max-complexity=15','--ignore=E265','--max-line-length=140']+project_python_files)
169169
ifretcode==0:
170170
print_success_message('No style errors')
171171
returnretcode

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp