-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathencoding_detector.py
303 lines (255 loc) · 10.8 KB
/
encoding_detector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import base64
import binascii
import re
import urllib.parse
def detect_encoding(data):
    """
    Detect the encoding of the provided data.

    The probes are tried in a fixed order and the first one that accepts the
    input decides the result: hex, base64, URL (percent) encoding, ASCII85,
    then quoted-printable.

    Hex is probed before base64 because every hex digit is also a base64
    character, so a long hex dump can satisfy the base64 heuristics as well.
    Asking the stricter question first keeps such input labelled "hex".

    Args:
        data (str): The data to analyze

    Returns:
        str or None: "hex", "base64", "url", "ascii85" or "quoted-printable",
        or None when the input is empty, shorter than four characters, or
        accepted by none of the probes.
    """
    # Too little material to classify reliably.
    if not data or len(data) < 4:
        return None

    # Built after the guard so the guard path touches no other function.
    probes = (
        ("hex", is_hex),
        ("base64", is_base64),
        ("url", is_url_encoded),
        ("ascii85", is_ascii85),
        ("quoted-printable", is_quoted_printable),
    )
    for label, probe in probes:
        if probe(data):
            return label

    # No recognized encoding detected
    return None
def _payload_looks_meaningful(decoded):
    """Return True when decoded bytes look like real content, not noise."""
    known_signatures = (
        b'%PDF',                     # PDF
        b'PK\x03\x04',               # ZIP / Office docs
        b'\xFF\xD8\xFF',             # JPEG
        b'\x89PNG',                  # PNG
        b'GIF8',                     # GIF
        b'BM',                       # BMP
        b'ID3',                      # MP3
        b'\x00\x00\x00\x18ftypmp4',  # MP4
    )
    if decoded.startswith(known_signatures):
        return True
    try:
        text = decoded.decode('utf-8', errors='strict')
    except UnicodeDecodeError:
        # Not UTF-8 text: only a sizeable blob is accepted as binary data.
        return len(decoded) > 100
    # Text must contain a run of at least ten ordinary characters.
    return re.search(r'[a-zA-Z0-9.,;:!?()}{"\' ]{10,}', text) is not None


def _decodes_as_base64(clean_data, alphabet_class, decoder):
    """Return True when clean_data decodes plausibly with one base64 variant."""
    in_alphabet = len(re.findall(alphabet_class, clean_data))
    # At least 95% of the characters must belong to the variant's alphabet.
    if in_alphabet / len(clean_data) <= 0.95:
        return False
    # Pad a private copy so one variant's padding never affects the other.
    padded = clean_data + '=' * (-len(clean_data) % 4)
    try:
        decoded = decoder(padded)
    except ValueError:
        # binascii.Error (bad padding/length) and the non-ASCII input error
        # raised by the base64 module are both ValueError subclasses.
        return False
    return _payload_looks_meaningful(decoded)


def is_base64(data):
    """Check if a string is Base64 encoded (standard or URL-safe alphabet).

    The check is heuristic. After removing whitespace, the input must be
    made up of more than 95% alphabet characters, must decode, and the
    decoded bytes must either start with a known file signature, be UTF-8
    text containing a run of ten ordinary characters, or be more than 100
    bytes of non-UTF-8 data.

    Args:
        data (str): The candidate text. Non-str input is never accepted.

    Returns:
        bool: True when either base64 variant decodes to plausible content.
    """
    if not isinstance(data, str):
        return False

    # Trim leading/trailing whitespace
    data = data.strip()

    # Minimum 24 characters for reasonable base64 data
    if len(data) < 24:
        return False

    # Line breaks and other whitespace are legal inside base64 text.
    clean_data = re.sub(r'\s', '', data)
    if not clean_data:
        return False

    return (
        _decodes_as_base64(clean_data, r'[A-Za-z0-9+/=]', base64.b64decode)
        or _decodes_as_base64(
            clean_data, r'[A-Za-z0-9_\-=]', base64.urlsafe_b64decode
        )
    )
def is_hex(data):
    """Report whether the text is an even-length run of hexadecimal digits.

    The input has to consist solely of the characters 0-9, a-f and A-F, have
    an even number of characters, and be accepted by binascii.unhexlify.

    Args:
        data (str): The candidate text.

    Returns:
        bool: True when the text decodes as hex, False otherwise.
    """
    only_hex_digits = re.match(r'^[0-9A-Fa-f]+$', data)
    if not only_hex_digits or len(data) % 2 != 0:
        return False

    # Final confirmation: let the real decoder have the last word.
    try:
        binascii.unhexlify(data)
    except Exception:
        return False
    return True
def is_url_encoded(data):
    """Report whether the text contains percent-escapes that decode.

    The text qualifies when it holds at least one "%XX" escape (two hex
    digits) and unquoting it yields something different from the input.

    Args:
        data (str): The candidate text.

    Returns:
        bool: True when unquoting changes the text, False otherwise.
    """
    # Without a percent sign there is nothing to unescape.
    if '%' not in data:
        return False
    if re.search(r'%[0-9A-Fa-f]{2}', data) is None:
        return False

    try:
        unescaped = urllib.parse.unquote(data)
    except Exception:
        return False
    # A changed string means at least one escape was really decoded.
    return unescaped != data
def is_ascii85(data):
    """Check if a string is ASCII85 encoded in the Adobe "<~ ... ~>" framing.

    Between the delimiters the body may contain the Ascii85 digits '!'
    through 'u', the 'z' shorthand for a group of four zero bytes, and ASCII
    whitespace (encoders commonly wrap long output). base64.a85decode
    accepts both 'z' and whitespace, so the detector accepts them too.

    Args:
        data (str): The candidate text, including its delimiters.

    Returns:
        bool: True when the framed body decodes as Ascii85.
    """
    whitespace = ' \t\n\r\x0b'  # the characters a85decode ignores by default
    if re.fullmatch(r'<~[!-uz \t\n\r\x0b]+~>', data) is None:
        return False

    # Strip the delimiters; a body made only of whitespace carries no data.
    content = data[2:-2]
    if not content.strip(whitespace):
        return False

    try:
        base64.a85decode(content)
    except ValueError:
        # Raised for group overflow, a misplaced 'z', or other invalid input.
        return False
    return True
def is_quoted_printable(data):
    """Check if a string is Quoted-Printable encoded.

    The text qualifies when it contains at least one "=XX" escape (two
    upper-case hex digits) and quoted-printable decoding changes it. The
    comparison is done on bytes, so escapes that stand for non-UTF-8 bytes
    (for example "=E9" from Latin-1 text) are still recognised.

    Args:
        data (str): The candidate text.

    Returns:
        bool: True when decoding alters the input, False otherwise.
    """
    # Look for the =XX pattern typical of Quoted-Printable
    if '=' not in data or re.search(r'=[0-9A-F]{2}', data) is None:
        return False

    import quopri  # local import: the module does not import quopri at top level

    try:
        raw = data.encode()
        decoded = quopri.decodestring(raw)
    except ValueError:
        # Covers UnicodeEncodeError for text that cannot be encoded.
        return False
    # If decoding changed the bytes, at least one escape was real.
    return decoded != raw
def _as_bytes(data):
    """Return str input encoded to bytes; return anything else unchanged."""
    if isinstance(data, str):
        return data.encode()
    return data


def _decode_base64(data):
    """Decode standard or URL-safe base64 text, tolerating stray characters."""
    # Special handling for PDFs - PDF signature in base64 starts with JVBERi0
    if data.strip().startswith('JVBERi0'):
        print("Detected PDF in base64 format - using precise handling")

    # Clean the data - remove whitespace, newlines
    compact = re.sub(r'\s', '', data)

    # '-' and '_' only exist in the URL-safe alphabet. Map them onto '+' and
    # '/' BEFORE sanitising, otherwise the clean-up below would delete them
    # and the payload would decode to the wrong bytes. When both alphabets
    # appear the input is ambiguous and is treated as standard base64.
    if re.search(r'[-_]', compact) and not re.search(r'[+/]', compact):
        compact = compact.translate(str.maketrans('-_', '+/'))

    # Remove any non-base64 characters (aggressive cleaning for noisy input)
    compact = re.sub(r'[^A-Za-z0-9+/=]', '', compact)

    # Add padding if needed
    compact += '=' * (-len(compact) % 4)

    try:
        decoded = base64.b64decode(compact)
    except ValueError as e1:
        print(f"Standard base64 decode failed: {str(e1)}")
        # As a last resort, try with original data
        try:
            return base64.b64decode(data)
        except ValueError as e3:
            print(f"Original data base64 decode failed: {str(e3)}")
            raise

    if decoded.startswith(b'%PDF'):
        print("Successfully decoded PDF content")
    return decoded


def _decode_hex(data):
    """Decode hexadecimal text after removing any whitespace."""
    return binascii.unhexlify(re.sub(r'\s', '', data))


def _decode_url(data):
    """Decode percent-escapes straight to bytes."""
    # unquote_to_bytes keeps escapes such as %FF as the raw byte; decoding
    # to str first would replace non-UTF-8 bytes with U+FFFD.
    return urllib.parse.unquote_to_bytes(data)


def _decode_ascii85(data):
    """Decode Ascii85 text, with or without the "<~ ... ~>" delimiters."""
    # Work on a local so the caller's fallback still sees the original input.
    payload = data
    if payload.startswith('<~') and payload.endswith('~>'):
        payload = payload[2:-2]
    return base64.a85decode(payload)


def _decode_quoted_printable(data):
    """Decode quoted-printable text to bytes."""
    import quopri  # local import: the module does not import quopri at top level
    return quopri.decodestring(data.encode())


def decode_content(data, encoding_type):
    """
    Decode content based on its encoding type.

    Decoding is best-effort: if the selected decoder fails, the error is
    printed and the original input is returned (encoded to bytes when it is
    a str). An unrecognised encoding type also returns the input as bytes.

    Args:
        data (str): The data to decode
        encoding_type (str): One of "base64", "hex", "url", "ascii85" or
            "quoted-printable". Any other value means "no decoding".

    Returns:
        bytes: The decoded content, or the original input as bytes when the
        type is unknown or decoding fails.
    """
    # Maps each type to the label used in its error message and its decoder.
    decoders = {
        "base64": ("Base64", _decode_base64),
        "hex": ("Hex", _decode_hex),
        "url": ("URL", _decode_url),
        "ascii85": ("ASCII85", _decode_ascii85),
        "quoted-printable": ("Quoted-printable", _decode_quoted_printable),
    }

    entry = decoders.get(encoding_type) if isinstance(encoding_type, str) else None
    if entry is not None:
        label, decoder = entry
        try:
            return decoder(data)
        except Exception as e:
            # Deliberate catch-all: this function must never raise on bad
            # input. The failure is reported, then the fallback is returned.
            print(f"{label} decode error: {str(e)}")

    # Default case - return the data as bytes
    return _as_bytes(data)