dilos/tdesdk/debian/desktop-i18n/msgsplit


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168

#! /usr/bin/env python

import sys, string, codecs, os

# Threshold used when wrapping the text of one output line.
# TODO: the 78 characters are counted *without* the surrounding quotes, whereas Gettext counts them *with* the quotes
# FIXME: it seems possible to get lines longer than 80 characters.
max_length = 78

# Tags in front of which splitit() starts a new output line.
wrap_before = (
	# headings
	['<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>']
	# paragraphs and line breaks
	+ ['<p>', '<br>', '<br/>']
	# lists
	+ ['<ol>', '<ul>', '<li>']
	# tables
	+ ['<table>', '<th>', '<tr>', '<td>']
	# other block-level elements
	+ ['<center>', '<blockquote>', '<pre>', '<hr>', '<hr/>']
)

### TODO: try to support any charset, not only UTF-8 (so that it can be used outside TDE)

def _write_quoted(outfile, text):
	"""Write text to outfile as one double-quoted line, encoded as UTF-8."""
	outstr = '"%s"\n' % text
	outfile.write(outstr.encode('utf-8'))

def _is_escaped(message, index):
	"""Tell whether the character at message[index] is backslash-escaped.

	The character is escaped when an odd number of backslashes directly
	precedes it; an even run consists only of escaped backslashes.
	"""
	backslashes = 0
	pos = index - 1
	while pos >= 0 and message[pos] == '\\':
		backslashes += 1
		pos -= 1
	return backslashes % 2 == 1

def splitit( start, message, outfile ):
	"""Write one keyword and its message text, wrapped over several lines if needed.

	start is the keyword prefix including its trailing space (for example
	'msgid '); when it is empty nothing is written at all.
	message is the text without the surrounding quotes, still in escaped
	form (a line break is the two characters backslash and n).
	outfile is any object with a write() method; it receives UTF-8
	encoded byte strings.

	If start plus message is shorter than max_length and the message
	contains no backslash-n sequence, a single line is written. Otherwise
	the keyword is written with an empty string, followed by quoted
	continuation lines. A continuation line ends:
	  - after every escaped newline,
	  - in front of any tag listed in wrap_before (unless the tag is at
	    the very start of the pending text),
	  - once the scan position exceeds max_length: after the last '>'
	    if it lies beyond position 50 (including spaces following it),
	    else after the last space, else after the last comma, else at the
	    current position, moved back over any backslashes so that an
	    escape sequence is not cut in two.
	"""
	if not start:
		return
	# print start+"\""+message+"\"" # DEBUG
	if len(message) + len(start) < max_length and message.find('\\n') == -1:
		outstr = '%s"%s"\n' % (start, message)
		outfile.write(outstr.encode('utf-8'))
		return
	# Encode the header explicitly, like every other line, instead of
	# relying on the implicit default codec of the file object.
	outfile.write((u'%s""\n' % start).encode('utf-8'))
	index = 0
	mlen = len(message)
	# Position 0 doubles as "not seen yet" for the three markers below.
	last_brace = last_space = last_comma = 0
	while index < mlen:
		char = message[index]
		if char == 'n' and _is_escaped(message, index):
			# An escaped newline always ends the output line.
			_write_quoted(outfile, message[:index+1])
			message = message[index+1:]
			mlen = len(message)
			index = 0
			last_brace = last_space = last_comma = 0
			continue
		elif char == '>':
			last_brace = index
		elif char == ' ':
			last_space = index
		elif char == ',':
			last_comma = index
		elif char == '<' and index > 0:
			for tag in wrap_before:
				if message.startswith(tag, index):
					# Start a new output line in front of the tag.
					_write_quoted(outfile, message[:index])
					message = message[index:]
					mlen = len(message)
					index = 0
					last_brace = last_space = last_comma = 0
					break
		if index > max_length:
			if last_brace > 50:
				index = last_brace
				# Keep the spaces following the '>' on the same line.
				while index < mlen - 1 and message[index+1] == ' ':
					index += 1
			elif last_space != 0:
				index = last_space
			elif last_comma != 0:
				index = last_comma
			else:
				# No natural break point: do not cut right after a backslash.
				while index > 0 and message[index] == '\\':
					index = index - 1
			_write_quoted(outfile, message[:index+1])
			message = message[index+1:]
			mlen = len(message)
			index = 0
			last_brace = last_space = last_comma = 0
			continue
		index += 1
	if message:
		_write_quoted(outfile, message)

if sys.hexversion >= 0x02030000:
	# We have Python 2.3 or better
	open_type="rU" # Open for read with "Universal Newline Support"
else:
	# We have a Python older than 2.3
	open_type="r" # Normal open for read

# Keywords that are directly followed by the quoted text.
_plain_keywords = ["msgid ", "msgstr ", "msgctxt ", "msgid_plural "]

def _abort(name, problem, lineno):
	"""Report a problem in line lineno of file name on stdout and exit with status 1."""
	sys.stdout.write("%s %s %d\n" % (name, problem, lineno))
	sys.exit(1)

def _quoted_text(rest):
	"""Return the text between the quotes in the remainder of a keyword line.

	The last character (closing quote), leading white space and then the
	first character (opening quote) are dropped; the quotes themselves are
	not verified.
	"""
	return rest[:-1].lstrip()[1:]

def _parse_keyword_line(line):
	"""Split a stripped keyword line into its keyword and its text.

	Returns a tuple (start, last, error). On success error is None, start
	is the keyword including its trailing space and last is the text
	between the quotes. On failure start and last are None and error is
	the description of the problem to report.
	"""
	for keyword in _plain_keywords:
		if line.startswith(keyword):
			return keyword, _quoted_text(line[len(keyword):]), None
	if not line.startswith("msgstr["):
		return None, None, "parsing error in line"
	# Slices are used instead of single indexes below so that a truncated
	# line yields a parse error and not an IndexError.
	# For most languages, there will be only one digit
	if line[8:10] == "] ":
		if line[7].isdigit():
			return line[:10], _quoted_text(line[10:]), None
		return None, None, "not-a-digit error for mgstr[] in line"
	posdigit = 7 # The first digit is at position 7
	while line[posdigit:posdigit+1].isdigit():
		posdigit += 1
	if posdigit > 7 and line[posdigit:posdigit+2] == "] ":
		posdigit += 2 # skip ] and the space
		return line[:posdigit], _quoted_text(line[posdigit:]), None
	return None, None, "parse error after msgstr[ in line"

# Rewrite every file named on the command line in place: the wrapped result
# goes to "<file>.new", which replaces the original once the whole file has
# been processed without error.
### TODO: even in the case of a parse error, the script could try to process the next file(s) instead of exiting.
for file in sys.argv[1:]:
	orig_file = open(file, open_type)
	new_file = None
	try:
		new_file = open(file + ".new", 'w')
		last=''   # text of the message collected so far
		start=''  # keyword of the message collected so far ('' if none)
		index=0   # number of the current input line, for error reports
		while 1: # python 2.1 has no True ;)
			raw_line = orig_file.readline()
			index += 1
			if not raw_line:
				break
			if raw_line == '\n' or raw_line[0] == '#':
				line = ''
			else:
				try:
					line = unicode(raw_line, 'utf-8').strip()
				except UnicodeError:
					# Going on with the undecoded line would only mis-parse it.
					_abort(file, "is not valid UTF-8 in line", index)
			if not line:
				# Blank, comment or whitespace-only line: it ends the
				# pending message and is copied through unchanged.
				splitit(start, last, new_file)
				start = ''
				last = ''
				new_file.write(raw_line)
				continue
			if line[0] == '"' and line[-1:] == '"':
				# Continuation line: append its content to the pending message.
				last += line[1:-1]
				continue
			# new message
			splitit(start, last, new_file)
			start, last, error = _parse_keyword_line(line)
			if error is not None:
				_abort(file, error, index)
		splitit(start, last, new_file)
	finally:
		# Also reached on sys.exit() and on unexpected exceptions.
		orig_file.close()
		if new_file is not None:
			new_file.close()
	os.rename(file + ".new", file)

# kate:  space-indent off; indent-width 8; replace-tabs off;