Phrase extraction algorithm for statistical machine translation

# -*- coding: utf-8 -*- def phrase_extraction(srctext, trgtext, alignment): """ Phrase extraction algorithm. """ def extract(f_start, f_end, e_start, e_end): phrases = set() # return { } if f end == 0 if f_end == 0: return # for all (e,f) ∈ A do for e,f in alignment: # return { } if e < e start or e > e end if e < e_start or e > e_end: return fs = f_start # repeat- while True: fe = f_end # repeat- while True: # add phrase pair ( e start .. e end , f s .. f e ) to set E trg_phrase = " ".join(trgtext[i] for i in range(fs,fe)) src_phrase = " ".join(srctext[i] for i in range(e_start,e_end)) phrases.add("\t".join([src_phrase, trg_phrase])) fe+=1 # fe++ # -until fe aligned if fe in f_aligned or fe > trglen: break fs-=1 # fe-- # -until fs aligned if fs in f_aligned or fs < 0: break return phrases # Calculate no. of tokens in source and target texts. srctext = srctext.split() trgtext = trgtext.split() srclen = len(srctext) trglen = len(trgtext) # Keeps an index of which source/target words are aligned. e_aligned = [i for i,_ in alignment] f_aligned = [j for _,j in alignment] bp = set() # set of phrase pairs BP # for e start = 1 ... length(e) do for e_start in range(srclen): # for e end = e start ... length(e) do for e_end in range(e_start, srclen): # // find the minimally matching foreign phrase # (f start , f end ) = ( length(f), 0 ) f_start, f_end = trglen, 0 # for all (e,f) ∈ A do for e,f in alignment: # if e start ≤ e ≤ e end then if e_start <= e <= e_end: f_start = min(f, f_start) f_end = max(f, f_end) # add extract (f start , f end , e start , e end ) to set BP phrases = extract(f_start, f_end, e_start, e_end) if phrases: bp.update(phrases) return bp srctext = "michael assumes that he will stay in the house" trgtext = "michael geht davon aus , dass er im haus bleibt" alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9), (5,9), (6,7), (7,7), (8,8)] phrases = phrase_extraction(srctext, trgtext, alignment) for i in phrases: print i

So I have had a look at this problem and can now reproduce the desired output. Turns out there are a number of problems:

The algorithm is not quite complete. In the online version of the book (3rd printing 2012) line 4 of the extract function has been updated. (Maybe there is an Errata)
The algorithm assumes indexing starting from 1 up to and including the end.
Python assumes 0 based indexing up to but not including the end.
In particular this affects the stopping values of some calls to range and a number of comparisons.
The items in the set have been modified to allow for easier reproduction of the desired output.

Example output (matching 19 phrases with some matches repeated 5 times, in total 24 extracted):

$ python2.7 phrase_extract_new.py 
( 1) (0, 1) michael — michael
( 2) (0, 2) michael assumes — michael geht davon aus ; michael geht davon aus ,
( 3) (0, 3) michael assumes that — michael geht davon aus , dass
( 4) (0, 4) michael assumes that he — michael geht davon aus , dass er
( 5) (0, 9) michael assumes that he will stay in the house — michael geht davon aus , dass er im haus bleibt
( 6) (1, 2) assumes — geht davon aus ; geht davon aus ,
( 7) (1, 3) assumes that — geht davon aus , dass
( 8) (1, 4) assumes that he — geht davon aus , dass er
( 9) (1, 9) assumes that he will stay in the house — geht davon aus , dass er im haus bleibt
(10) (2, 3) that — dass ; , dass
(11) (2, 4) that he — dass er ; , dass er
(12) (2, 9) that he will stay in the house — dass er im haus bleibt ; , dass er im haus bleibt
(13) (3, 4) he — er
(14) (3, 9) he will stay in the house — er im haus bleibt
(15) (4, 6) will stay — bleibt
(16) (4, 9) will stay in the house — im haus bleibt
(17) (6, 8) in the — im
(18) (6, 9) in the house — im haus
(19) (8, 9) house — haus
$ python2.7 phrase_extract_new.py | grep -c ';'
5

Below follows the suggested interpretation of the algorithm. This algorithm needs to be refactored quite a bit, but before that more testing is needed with different examples. Reproducing the example in the book is just a start:

# -*- coding: utf-8 -*-

def phrase_extraction(srctext, trgtext, alignment):
    """
    Phrase extraction algorithm.
    """
    def extract(f_start, f_end, e_start, e_end):
        if f_end < 0:  # 0-based indexing.
            return {}
        # Check if alignement points are consistent.
        for e,f in alignment:
            if ((f_start <= f <= f_end) and
               (e < e_start or e > e_end)):
                return {}

        # Add phrase pairs (incl. additional unaligned f)
        # Remark:  how to interpret "additional unaligned f"?
        phrases = set()
        fs = f_start
        # repeat-
        while True:
            fe = f_end
            # repeat-
            while True:
                # add phrase pair ([e_start, e_end], [fs, fe]) to set E
                # Need to +1 in range  to include the end-point.
                src_phrase = " ".join(srctext[i] for i in range(e_start,e_end+1))
                trg_phrase = " ".join(trgtext[i] for i in range(fs,fe+1))
                # Include more data for later ordering.
                phrases.add(((e_start, e_end+1), src_phrase, trg_phrase))
                fe += 1 # fe++
                # -until fe aligned or out-of-bounds
                if fe in f_aligned or fe == trglen:
                    break
            fs -=1  # fe--
            # -until fs aligned or out-of- bounds
            if fs in f_aligned or fs < 0:
                break
        return phrases

    # Calculate no. of tokens in source and target texts.
    srctext = srctext.split()   # e
    trgtext = trgtext.split()   # f
    srclen = len(srctext)       # len(e)
    trglen = len(trgtext)       # len(f)
    # Keeps an index of which source/target words are aligned.
    e_aligned = [i for i,_ in alignment]
    f_aligned = [j for _,j in alignment]

    bp = set() # set of phrase pairs BP
    # for e start = 1 ... length(e) do
    # Index e_start from 0 to len(e) - 1
    for e_start in range(srclen):
        # for e end = e start ... length(e) do
        # Index e_end from e_start to len(e) - 1
        for e_end in range(e_start, srclen):
            # // find the minimally matching foreign phrase
            # (f start , f end ) = ( length(f), 0 )
            # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1]
            f_start, f_end = trglen-1 , -1  #  0-based indexing
            # for all (e,f) ∈ A do
            for e,f in alignment:
                # if e start ≤ e ≤ e end then
                if e_start <= e <= e_end:
                    f_start = min(f, f_start)
                    f_end = max(f, f_end)
            # add extract (f start , f end , e start , e end ) to set BP
            phrases = extract(f_start, f_end, e_start, e_end)
            if phrases:
                bp.update(phrases)
    return bp

# Refer to match matrix.
#             0      1      2   3  4     5   6   7    8
srctext = "michael assumes that he will stay in the house"
#             0      1    2    3  4  5   6  7   8     9
trgtext = "michael geht davon aus , dass er im haus bleibt"
alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9), (5,9), (6,7), (7,7), (8,8)]

phrases = phrase_extraction(srctext, trgtext, alignment)

# Keep track of translations of each phrase in srctext and its
# alignement using a dictionary with keys as phrases and values being
# a list [e_alignement pair, [f_extractions, ...] ]
dlist = {}
for p, a, b in phrases:
    if a in dlist:
        dlist[a][1].append(b)
    else:
        dlist[a] = [p, [b]]

# Sort the list of translations based on their length.  Shorter phrases first.
for v in dlist.values():
    v[1].sort(key=lambda x: len(x))


# Function to help sort according to book example.
def ordering(p):
    k,v = p
    return v[0]
#
for i, p in enumerate(sorted(dlist.items(), key = ordering), 1):
    k, v = p
    print "({0:2}) {1} {2} — {3}".format( i, v[0], k, " ; ".join(v[1]))

The final part where the output is prepared can probably be improved...and the algorithm code can certaily be made more Pythonic.

Recommended topics

Hot tags