ورود

**Mehrbod** · 05-03-2013, 02:18 PM

نرم‌افزار PârsikNemâye PeyNevis یا PN² یک برنامه برای شناسایی خودکار واژگان پارسیک و برچسب زدن آنها میباشد.

نرم‌افزار به زبان Python نوشته شده و از این فندآوری‌ها[sup][aname=rpaf905][[/aname][anchor=paf905]1][/anchor][/sup] میبهرد[sup][aname=rpac0b1][[/aname][anchor=pac0b1]2][/anchor][/sup]:

١. Python Programming Language Official Website
٢. MongoDB

از آنجاییکه port کردن برنامه برای همگان کارِ دشوار و زمانبری است، چندتایی از راهکارهایی که پیشتر در آورده‌ام را اینجا میگذارم، شاید به کار دیگران بیایند.

unipers/mapped_chars.ini

کد:
آ    â

ا    a

ب    b

پ    p

ت    t

ث    s

ج    j

چ    c

ح    h

خ    x

د    d

ذ    z

ر    r

ز    z

ژ    ž

س    s

ش    š

ص    s

ض    z

ط    t

ظ    z

ع    å

غ    q

ف    f

ق    q

ک    k

گ    g

ل    l

م    m

ن    n

#و    ů

و    v

ه    h

ی    y

ء    $

آ    â

اً    $

هٔ    $

ة    $

ٸ    '

ترانویس‌ها (هیچکدام ١٠٠% درست کار نمیکند)

کد:
# Reverse transliteration table: UniPers (Latin) character -> Perso-Arabic
# character, built from the tab-separated "perso<TAB>latin" rows of the ini file.
_unimapped = {}

with open('unipers/mapped_chars.ini', 'r', encoding='utf-8') as ini_file:
    _rows = ini_file.read().split('\n')

# Order the rows by their first (Perso-Arabic) column so that, when several
# letters share one Latin character, the entry kept below is always the same.
_rows.sort(key=lambda row: row.split('\t')[0])

for _row in _rows:
    # Rows without a tab separator and commented-out rows carry no mapping.
    if '\t' not in _row or _row.startswith('#'):
        continue
    _columns = _row.split('\t')
    # The first row seen for a given Latin character wins.
    _unimapped.setdefault(_columns[1], _columns[0])

# These entries are set explicitly and override whatever the ini file provided.
_unimapped.update({'i': 'ی', 'u': 'و', 'v': 'و', 'w': 'و', ' ': ' '})

# Every ASCII punctuation character maps to itself.
_unimapped.update(zip(string.punctuation, string.punctuation))

def stripvowels(word):
    """Return *word* with its Arabic-script diacritics removed.

    Every run of kasra, damma, fatha, sukun or shadda marks is deleted; all
    other characters are left untouched.
    """
    diacritic_run = '(ِ|ُ|َ|ْ|ّ)+'
    return re.sub(diacritic_run, '', word)

def unipers2roman(word, reverse=False):
    """Convert between UniPers single letters and their ASCII digraphs.

    Args:
        word: The text to convert.
        reverse: When false (default) replace ``x``, ``ž``, ``š`` and ``c``
            with ``kh``, ``zh``, ``sh`` and ``ch``. When true, perform the
            opposite replacement (digraph back to the single letter).

    Returns:
        The converted string. The replacements are applied one after another
        in the order listed below.
    """
    # A static tuple of pairs is enough here: the table is only ever iterated
    # (forwards or backwards), so no bidirectional mapping object has to be
    # constructed on every call.
    digraphs = (('x', 'kh'), ('ž', 'zh'), ('š', 'sh'), ('c', 'ch'))
    for single, digraph in digraphs:
        if reverse:
            word = word.replace(digraph, single)
        else:
            word = word.replace(single, digraph)
    return word

def unipers2perso(word, vowels=False, hint=None):

  """Heuristically transliterate a UniPers (Latin-script) word to Perso-Arabic.

  The word is processed character by character (lower-cased). Consonants are
  looked up in the module-level ``_unimapped`` table; vowels and a handful of
  letter combinations are handled by the special cases below.

  Args:
    word: The UniPers string to convert.
    vowels: When true, a diacritic (kasra/damma/fatha) is appended for every
      non-initial ``e``/``o``/``a``.
    hint: Optional string that is searched for the candidate Perso-Arabic
      letters whenever a Latin letter has several possible spellings
      (``t``, ``z``, ``s``, ``h``) -- presumably an existing Perso-Arabic
      spelling of the same word; TODO confirm with callers.

  Returns:
    The Perso-Arabic string, or ``''`` for an empty *word*.

  Raises:
    KeyError: if a character has no entry in ``_unimapped``.
  """

  if len(word) == 0: return ''

  ret = ''

  # Diacritics emitted for short vowels when ``vowels`` is requested.
  _vowels = {'e':'ِ', 'o':'ُ', 'a':'َ'}

  # Latin letters that can stand for more than one Perso-Arabic letter ...
  mfaced = 't z s h'.split()

  # ... and their candidate spellings; the first candidate is the default.
  mfaces = {

    't':'ت ط'.split(),

    'z':'ز ذ ض ظ'.split(),

    's':'س ص ث'.split(),

    'h':'ه ح'.split(),

    }

  idx = 0

  # NOTE(review): ``c`` comes from the lower-cased word, but every look-ahead /
  # look-behind below indexes the original ``word`` -- confirm inputs are
  # always lower-case.
  for idx, c in enumerate(word.lower()):

    # An 'i' directly followed by 'y' is skipped; the 'y' alone is written.
    if idx < len(word) -1 and c == 'i' and word[idx+1] == 'y':

      continue

    # Word-initial vowels get an alef carrier.
    if idx == 0 and c in ['i']: ret += 'ای'

    elif idx == 0 and c in ['u']: ret += 'او'

    elif idx == 0 and c in ['e', 'o', 'a']: ret += 'ا'

    # Non-initial short vowels are normally not written, with these exceptions.
    elif c in ['e', 'o', 'a'] and idx > 0:

      # 'a' preceded by "pas" or by 'i' is written as alef.
      if c in ['a'] and (word[idx-3:idx] in ['pas'] or word[idx-1] in ['i']): ret += 'ا'

      # 'o' starting one of these patterns is left unwritten.
      elif c in ['o'] and re.match('o(.udan|dâ)', word[idx:]): pass

      elif c in ['o'] and re.match('o([z])?ir', word[idx:]): pass

      # Otherwise 'o' before one of these letters (and with at least two
      # characters following it) is written as vâv.
      elif c in ['o'] and idx < len(word)-2 and word[idx+1] in ['â','g','k','l','d', 'b', 'z']: ret += 'و'

      if vowels: ret += _vowels[c]

    else:

      if c in mfaced:

        if hint and idx < len(hint):

          l = mfaces[c]

          # (position, letter) for every candidate found in ``hint`` at or
          # after ``idx``, earliest position first.
          indices = sorted(filter(lambda x: x[0] > -1, ((hint.find(ch, idx), ch,) for ch in l)), key=lambda x: x[0])

          if indices:

            # NOTE(review): this rebinds the loop variable ``idx`` to a position
            # in ``hint``; the checks after this if/else chain then use that
            # value for the rest of the iteration (enumerate resets it on the
            # next pass). Confirm this is intended.
            idx = indices[0][0]

            # NOTE(review): ``idx`` was returned by ``hint.find`` so it is
            # always < len(hint); the ``l[0]`` fallback looks unreachable.
            ret += indices[0][1] if idx < len(hint) else l[0]

          else:

            ret += l[0]

        else:

          ret += mfaces[c][0]

      # NOTE(review): 'e' is already consumed by the two vowel branches above
      # (idx == 0 and idx > 0), so this branch appears unreachable.
      elif c in ['e']: ret += 'ع'

      else: ret += _unimapped[c]

    # Word-final 'a'/'e' is followed by a silent he.
    if idx == len(word)-1 and c in ['a', 'e']: ret += 'ه'

    elif c in ['â']:

      # 'â' followed by 'i' (not near the end of the word) gets an extra ye.
      if idx < len(word)-2 and word[idx+1] == 'i': ret += 'ی'

    # 'e' that is not followed (within 1-3 characters) by one of the listed
    # endings: insert he + zero-width non-joiner before certain letters.
    elif c in ['e'] and not re.search('(.{1,3})(udan|â|uxtan|uz|stan)', word[idx:]):

      # NOTE(review): a word-final 'e' is already handled by the first ``if``
      # of this chain, so this test looks unreachable.
      if idx == len(word)-1: ret += 'ه'

      elif idx < len(word) -1:

        if word[idx+1] in ['a', 'â', 's', 'p']: ret += 'ه' + '\u200c'

        elif word[idx+1] in ['i']: ret += 'ه' + '\u200cا'

  # Alef-madda is only kept in first position; elsewhere it becomes plain alef.
  # (Assumes ``ret`` is non-empty at this point.)
  ret = ret[0] + ret[1:].replace('آ','ا')

  # Collapse runs of the same character into one, except for ye, nun and vâv.
  # The loop iterates over the string ``ret`` referred to when it started.
  for c in ret:

    if c in ['ی','ن','و']: continue

    ret = re.sub('%s+' % re.escape(c),c, ret)

  return ret

گرفتن بن کنون از کارواژه[sup][aname=rpa5f5c][[/aname][anchor=pa5f5c]3][/anchor][/sup]:

augment_verb_exceptions.ini:

کد:
#Exception_Forms

zistan ziv

bihudan bihun

کد:
# Irregular verbs: infinitive -> present stem, one whitespace-separated pair per
# non-empty line of the ini file. The first line (the header) is skipped.
_exception_rows = fread('augment_verb_exceptions.ini').splitlines()[1:]
v_exceptions = dict(row.split() for row in _exception_rows if row)

def konundis(verb):
    """Derive the present stem of a verb from its infinitive.

    The online lookup (``jahanshiri``) is tried first, then the
    ``v_exceptions`` table, and finally a list of suffix rewrite rules.

    Args:
        verb: Either a plain infinitive string or a ``Word`` instance. For a
            ``Word`` with ``roots``, the verb part is taken from the roots:
            from index ``roots_into_verb`` onwards when that is truthy,
            otherwise the last root with the preceding roots kept as a prefix.
            A ``Word`` without roots contributes its ``text``.

    Returns:
        The present stem, with any prefix re-attached in front of it.

    Raises:
        AssertionError: if no lookup or rule yields a non-empty stem.
    """
    prefix = ""
    if isinstance(verb, Word):
        if verb.roots:
            if verb.roots_into_verb:
                verb = ''.join(verb.roots[verb.roots_into_verb:]).lower()
            else:
                prefix = ''.join(verb.roots[:-1]).lower()
                verb = verb.roots[-1].lower()
            # A bare infinitive ending carries no stem of its own.
            if verb in ['idan', 'dan']:
                return prefix
        else:
            verb = verb.text

    # Try the online lookup with the prefix attached first; if that succeeds
    # the result already contains the prefix, so it must not be added again.
    # NOTE(review): with an empty prefix the second lookup repeats the first.
    ret = jahanshiri((prefix + verb).lower())
    if len(ret) > 0:
        prefix = ''
    else:
        ret = jahanshiri(verb.lower())
    if len(ret) > 0:
        return prefix + ret

    if verb in v_exceptions:
        return prefix + v_exceptions[verb]

    # Ordered (pattern, format) rules; the first pattern that matches wins and
    # its first group is substituted into the format string.
    mapped = [
        ['(.*?)oftan', '%sub'],
        ['(.*?)aftan', '%sâv'], ['(.*?)âftan', '%sâb'], ['(.*?)eftan', '%sev'], ['(.*?)ftan', '%sb'],
        ['(.*?)eštan', '%sis'],
        ['(.*?)aštan', '%sard'],
        ['(.*?)(štan|štân)', '%sr'],
        ['(.{1,3})astan', '%san'],
        ['(.*?)xtan', '%sz'],
        ['(.*?)udan', '%sâ'],
        ['(.*?)ostan', '%su'],
        ['(.{1,2})idan', '%sin'],
        ['(.{1,2})stan', '%ss'],
        ['(.*?)(adan|yidan|idan|âdan|dan|estan|stan|tan)', '%s'],
    ]

    ret = None
    for dis, fmt in mapped:
        found = re.match(dis, verb)
        if found:
            ret = fmt % found.groups()[0]
            break

    # Raised explicitly (instead of using an ``assert`` statement) so that the
    # check is not stripped when Python runs with -O; the exception type that
    # callers see is unchanged.
    if not ret:
        raise AssertionError("Kârvâžeye nâšenâxte: '%s'" % verb)
    return prefix + ret

def gozaštedis(verb):
    """Derive the past stem of a verb by dropping its last two characters.

    Presumably the two characters are the infinitive's "-an" ending; the
    function itself does not check what it removes.
    """
    ending_length = 2
    return verb[:-ending_length]

def jahanshiri(verb):
    """Fetch the present stem of *verb* from the jahanshiri.ir conjugator.

    Args:
        verb: Infinitive in UniPers spelling; it is romanized with
            ``unipers2roman`` before being put into the request URL.

    Returns:
        The present stem converted back to UniPers spelling, or ``''`` when
        the page does not contain a present stem.

    Raises:
        AssertionError: if the page could not be downloaded after all
            attempts (or came back empty).
    """
    ret = ''
    url = 'http://www.jahanshiri.ir/pvc/conjpl.php?verb=%s&lang=en'
    # 'â' is the only character percent-encoded (as UTF-8) for the query.
    query = unipers2roman(verb).replace('â', '%C3%A2')

    # ``page`` must exist even when every attempt fails; otherwise the check
    # below would die with UnboundLocalError instead of the intended message.
    page = None
    # Five attempts in total, as before.
    for _attempt in range(5):
        try:
            page = tor.download(url % query)[0]
            break
        except Exception:
            # Best-effort retry: what ``tor.download`` may raise is not
            # visible here, so the broad catch is kept deliberately.
            continue

    # Raised explicitly so the check survives ``python -O``.
    if not page:
        raise AssertionError("Download failed for '%s'" % verb)

    if 'present stem' not in page:
        return ret

    # The stem is the third cell of the table row following a closed row.
    # NOTE(review): the pattern previously read '[/td][/tr][tr]...', i.e. the
    # HTML tags had been turned into regex character classes (apparently by a
    # markup conversion of the source); restored to real tags -- confirm
    # against the live page.
    found = re.search('</td></tr><tr>(?:<td>.*?</td>){2}<td>(.*?)</td>', page)
    if found is None:
        # Page layout not recognized: report "no stem" rather than crash.
        return ret
    return unipers2roman(found.groups()[0], reverse=True)

----
[aname=paf905]1[/aname]. [anchor=rpaf905]^[/anchor] fand+âvar+i::Fandâvari || فنداوری: تکنولوژی Ϣiki-En technology
[aname=pac0b1]2[/aname]. [anchor=rpac0b1]^[/anchor] Bahridan || بهریدن: بهره جستن; استفاده کردن Ϣiki-En to utilize; to use
[aname=pa5f5c]3[/aname]. [anchor=rpa5f5c]^[/anchor] kâr+vâže::Kârvâže || کارواژه: فعل Dehxodâ verb

ورود
نام‌کاربری / رایانامه
رمزعبور:	فراموشی رمزعبور
	مرا به‌خاطر بسپار.