算法实现/杂项/Base64
传统的(MIME)base64 编码和解码过程相当简单,可以用 JavaScript 实现,包括在特定行长度下所需的 MIME/等行分隔符。值得注意的是,许多 base64 函数(例如在 PHP 中)返回没有行分隔符的 base64 编码字符串,因为行分隔符可以在编码后轻松插入,并且许多情况下,base64 编码仅用于通过 XML 安全传输数据或插入数据库等情况下, — 在这些情况下,已知换行符是不必要且因此不受欢迎的。如果不需要,可以在这些函数中轻松注释掉换行符插入和删除(它们在各自函数中都只有一行)。
编码需要一个 base 64 字符数组,例如
var base64chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'.split("");
解码将需要逆列表(交换值索引),例如
var base64inv = {};
for (var i = 0; i < base64chars.length; i++)
{
base64inv[base64chars[i]] = i;
}
请注意,在实际实现中,最好显式地列出上面每个列表的整个数组/哈希表 - 这里的一行代码是为了尽可能直观地演示概念,而不是实际应用中的理想情况。
base64 编码函数
function base64_encode (s)
{
// the result/encoded string, the padding string, and the pad count
var base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
var r = "";
var p = "";
var c = s.length % 3;
// add a right zero pad to make this string a multiple of 3 characters
if (c > 0) {
for (; c < 3; c++) {
p += '=';
s += "\0";
}
}
// increment over the length of the string, three characters at a time
for (c = 0; c < s.length; c += 3) {
// we add newlines after every 76 output characters, according to the MIME specs
if (c > 0 && (c / 3 * 4) % 76 == 0) {
r += "\r\n";
}
// these three 8-bit (ASCII) characters become one 24-bit number
var n = (s.charCodeAt(c) << 16) + (s.charCodeAt(c+1) << 8) + s.charCodeAt(c+2);
// this 24-bit number gets separated into four 6-bit numbers
n = [(n >>> 18) & 63, (n >>> 12) & 63, (n >>> 6) & 63, n & 63];
// those four 6-bit numbers are used as indices into the base64 character list
r += base64chars[n[0]] + base64chars[n[1]] + base64chars[n[2]] + base64chars[n[3]];
}
// add the actual padding string, after removing the zero pad
return r.substring(0, r.length - p.length) + p;
}
base64 编码函数
class Base64Encode {
private final static String base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
public static String encode(String s) {
// the result/encoded string, the padding string, and the pad count
String r = "", p = "";
int c = s.length() % 3;
// add a right zero pad to make this string a multiple of 3 characters
if (c > 0) {
for (; c < 3; c++) {
p += "=";
s += "\0";
}
}
// increment over the length of the string, three characters at a time
for (c = 0; c < s.length(); c += 3) {
// we add newlines after every 76 output characters, according to
// the MIME specs
if (c > 0 && (c / 3 * 4) % 76 == 0)
r += "\r\n";
// these three 8-bit (ASCII) characters become one 24-bit number
int n = (s.charAt(c) << 16) + (s.charAt(c + 1) << 8)
+ (s.charAt(c + 2));
// this 24-bit number gets separated into four 6-bit numbers
int n1 = (n >> 18) & 63, n2 = (n >> 12) & 63, n3 = (n >> 6) & 63, n4 = n & 63;
// those four 6-bit numbers are used as indices into the base64
// character list
r += "" + base64chars.charAt(n1) + base64chars.charAt(n2)
+ base64chars.charAt(n3) + base64chars.charAt(n4);
}
return r.substring(0, r.length() - p.length()) + p;
}
}
另一种实现可以使用 Java 流。例如,可以创建一个 Base64EncoderStream,继承自 FilterOutputStream。
Base64 模块文档 展示了编码字符串的最短方法
irb(main):001:0> require "base64"
=> true
irb(main):002:0> enc = Base64.encode64('Send reinforcements')
=> "U2VuZCByZWluZm9yY2VtZW50cw==\n"
irb(main):003:0> plain = Base64.decode64(enc)
=> "Send reinforcements"
Ruby 版本,实现为一个工具,可以编码命令行中指定的特定文件
BASE64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
File.open(ARGV[0], 'rb') do |file| # read file name passed on command line
c = 0 # count outputted characters
while (buf = file.read(3)) != nil do # read up to 3 bytes at a time
pad = '' # padding for this group of 4 characters
while buf.length < 3
buf << "\0"
pad << '='
end
group24 = (buf[0] << 16) | (buf[1] << 8) | buf[2] # current 3 bytes as a 24 bit value
encoded = BASE64_CHARS[(group24 >> 18) & 0x3f, 1] # read the 24 bit value 6 bits at a time
encoded << BASE64_CHARS[(group24 >> 12) & 0x3f, 1]
encoded << BASE64_CHARS[(group24 >> 6) & 0x3f, 1]
encoded << BASE64_CHARS[(group24 >> 0) & 0x3f, 1]
encoded[4 - pad.length, pad.length] = pad # add the padding
print encoded
print "\r\n" if (c += 4) % 76 == 0 # output newline ever 76th character
end
end
base64 编码函数的 C 版本
#include <inttypes.h>
#include <string.h>
int base64encode(const void* data_buf, size_t dataLength, char* result, size_t resultSize)
{
const char base64chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
const uint8_t *data = (const uint8_t *)data_buf;
size_t resultIndex = 0;
size_t x;
uint32_t n = 0;
int padCount = dataLength % 3;
uint8_t n0, n1, n2, n3;
/* increment over the length of the string, three characters at a time */
for (x = 0; x < dataLength; x += 3)
{
/* these three 8-bit (ASCII) characters become one 24-bit number */
n = ((uint32_t)data[x]) << 16; //parenthesis needed, compiler depending on flags can do the shifting before conversion to uint32_t, resulting to 0
if((x+1) < dataLength)
n += ((uint32_t)data[x+1]) << 8;//parenthesis needed, compiler depending on flags can do the shifting before conversion to uint32_t, resulting to 0
if((x+2) < dataLength)
n += data[x+2];
/* this 24-bit number gets separated into four 6-bit numbers */
n0 = (uint8_t)(n >> 18) & 63;
n1 = (uint8_t)(n >> 12) & 63;
n2 = (uint8_t)(n >> 6) & 63;
n3 = (uint8_t)n & 63;
/*
* if we have one byte available, then its encoding is spread
* out over two characters
*/
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex++] = base64chars[n0];
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex++] = base64chars[n1];
/*
* if we have only two bytes available, then their encoding is
* spread out over three chars
*/
if((x+1) < dataLength)
{
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex++] = base64chars[n2];
}
/*
* if we have all three bytes available, then their encoding is spread
* out over four characters
*/
if((x+2) < dataLength)
{
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex++] = base64chars[n3];
}
}
/*
* create and add padding that is required if we did not have a multiple of 3
* number of characters available
*/
if (padCount > 0)
{
for (; padCount < 3; padCount++)
{
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex++] = '=';
}
}
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex] = 0;
return 0; /* indicate success */
}
以下是 Base64Encode 的 C++ 版本。此代码发布到公有领域。
#include <string>
#include <vector>
// Prototype
// std::basic_string<TCHAR> base64Encode(std::vector<BYTE> inputBuffer);
// This line goes in header file
/* Define these if they aren't already in your environment
* #define TEXT(x) Lx //Unicode
* #define TCHAR wchar_t //Unicode
* #define TCHAR char //Not unicode
* #define TEXT(x) x //Not unicode
* #define DWORD long
* #define BYTE unsigned char
* They are defined by default in Windows.h
*/
//Lookup table for encoding
//If you want to use an alternate alphabet, change the characters here
const static TCHAR encodeLookup[] = TEXT("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
const static TCHAR padCharacter = TEXT('=');
std::basic_string<TCHAR> base64Encode(std::vector<BYTE> inputBuffer)
{
std::basic_string<TCHAR> encodedString;
encodedString.reserve(((inputBuffer.size()/3) + (inputBuffer.size() % 3 > 0)) * 4);
DWORD temp;
std::vector<BYTE>::iterator cursor = inputBuffer.begin();
for(size_t idx = 0; idx < inputBuffer.size()/3; idx++)
{
temp = (*cursor++) << 16; //Convert to big endian
temp += (*cursor++) << 8;
temp += (*cursor++);
encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
encodedString.append(1,encodeLookup[(temp & 0x00000FC0) >> 6 ]);
encodedString.append(1,encodeLookup[(temp & 0x0000003F) ]);
}
switch(inputBuffer.size() % 3)
{
case 1:
temp = (*cursor++) << 16; //Convert to big endian
encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
encodedString.append(2,padCharacter);
break;
case 2:
temp = (*cursor++) << 16; //Convert to big endian
temp += (*cursor++) << 8;
encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
encodedString.append(1,encodeLookup[(temp & 0x00000FC0) >> 6 ]);
encodedString.append(1,padCharacter);
break;
}
return encodedString;
}
base64encode <- function(sobj) {
sstr <- as.character(sobj)
stopifnot(length(sstr) == 1) # we only like 1-entry string vectors for now
if (nchar(sstr) == 0) return("")
b64c <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
shfts <- c(18,12,6,0)
sand <- function(n,s) bitwAnd(bitwShiftR(n,s),63)+1
slft <- function(p,n) bitwShiftL(as.integer(p),n)
subs <- function(s,n) substring(s,n,n)
sbit <- charToRaw(sstr)
npad <- ( 3 - length(sbit) %% 3) %% 3 # yeah.
sbit <- c(sbit,rep(as.raw(0),npad))
pces <- lapply(seq(1,length(sbit),by=3),function(ii) sbit[ii:(ii+2)])
encv <- paste(sapply(pces,function(p) paste0(sapply(shfts,function(s)(subs(b64c,sand(slft(p[1],16)+slft(p[2],8)+slft(p[3],0),s)))))),collapse="")
if (npad > 0) substr(encv,nchar(encv)-npad+1,nchar(encv)) <- paste(rep("=",npad),collapse="")
return(encv)
}
base64 解码函数
function base64_decode (s)
{
var base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
// remove/ignore any characters not in the base64 characters list
// or the pad character -- particularly newlines
s = s.replace(new RegExp('[^'+base64chars.split("")+'=]', 'g'), "");
// replace any incoming padding with a zero pad (the 'A' character is zero)
var p = (s.charAt(s.length-1) == '=' ?
(s.charAt(s.length-2) == '=' ? 'AA' : 'A') : "");
var r = "";
s = s.substr(0, s.length - p.length) + p;
// increment over the length of this encoded string, four characters at a time
for (var c = 0; c < s.length; c += 4) {
// each of these four characters represents a 6-bit index in the base64 characters list
// which, when concatenated, will give the 24-bit number for the original 3 characters
var n = (base64chars.indexOf(s.charAt(c)) << 18) + (base64chars.indexOf(s.charAt(c+1)) << 12) +
(base64chars.indexOf(s.charAt(c+2)) << 6) + base64chars.indexOf(s.charAt(c+3));
// split the 24-bit number into the original three 8-bit (ASCII) characters
r += String.fromCharCode((n >>> 16) & 255, (n >>> 8) & 255, n & 255);
}
// remove any zero pad that was added to make this a multiple of 24 bits
return r.substring(0, r.length - p.length);
}
上面的实现最适合 JavaScript 之类的语言,这些语言非常有效地处理任意长度字符串的字符串连接。其他语言(例如 C)可以通过为新字符串/数组分配适当大小的内存来更有效地工作(输出字符串长度可以很容易地在开头从输入字符串计算出来),然后只需设置每个字符索引,而不是连接。
base64 解码函数
class Base64Decode {
private final static String base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
public static String decode(String s) {
// remove/ignore any characters not in the base64 characters list
// or the pad character -- particularly newlines
s = s.replaceAll("[^" + base64chars + "=]", "");
// replace any incoming padding with a zero pad (the 'A' character is
// zero)
String p = (s.charAt(s.length() - 1) == '=' ?
(s.charAt(s.length() - 2) == '=' ? "AA" : "A") : "");
String r = "";
s = s.substring(0, s.length() - p.length()) + p;
// increment over the length of this encoded string, four characters
// at a time
for (int c = 0; c < s.length(); c += 4) {
// each of these four characters represents a 6-bit index in the
// base64 characters list which, when concatenated, will give the
// 24-bit number for the original 3 characters
int n = (base64chars.indexOf(s.charAt(c)) << 18)
+ (base64chars.indexOf(s.charAt(c + 1)) << 12)
+ (base64chars.indexOf(s.charAt(c + 2)) << 6)
+ base64chars.indexOf(s.charAt(c + 3));
// split the 24-bit number into the original three 8-bit (ASCII)
// characters
r += "" + (char) ((n >>> 16) & 0xFF) + (char) ((n >>> 8) & 0xFF)
+ (char) (n & 0xFF);
}
// remove any zero pad that was added to make this a multiple of 24 bits
return r.substring(0, r.length() - p.length());
}
}
另一种实现可以使用 Java 流。例如,可以创建一个 Base64DecoderStream,继承自 FilterInputStream。
上面编码器的 C 解码器对应物。此代码也是公有领域。此解决方案已使用指针数学和查找表进行了优化。该算法处理多种编码格式:有和没有换行符,有和没有空格,有和没有填充字符。
#define WHITESPACE 64
#define EQUALS 65
#define INVALID 66
static const unsigned char d[] = {
66,66,66,66,66,66,66,66,66,66,64,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,62,66,66,66,63,52,53,
54,55,56,57,58,59,60,61,66,66,66,65,66,66,66, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,66,66,66,66,66,66,26,27,28,
29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66
};
int base64decode (char *in, size_t inLen, unsigned char *out, size_t *outLen) {
char *end = in + inLen;
char iter = 0;
uint32_t buf = 0;
size_t len = 0;
while (in < end) {
unsigned char c = d[*in++];
switch (c) {
case WHITESPACE: continue; /* skip whitespace */
case INVALID: return 1; /* invalid input, return error */
case EQUALS: /* pad character, end of data */
in = end;
continue;
default:
buf = buf << 6 | c;
iter++; // increment the number of iteration
/* If the buffer is full, split it into bytes */
if (iter == 4) {
if ((len += 3) > *outLen) return 1; /* buffer overflow */
*(out++) = (buf >> 16) & 255;
*(out++) = (buf >> 8) & 255;
*(out++) = buf & 255;
buf = 0; iter = 0;
}
}
}
if (iter == 3) {
if ((len += 2) > *outLen) return 1; /* buffer overflow */
*(out++) = (buf >> 10) & 255;
*(out++) = (buf >> 2) & 255;
}
else if (iter == 2) {
if (++len > *outLen) return 1; /* buffer overflow */
*(out++) = (buf >> 4) & 255;
}
*outLen = len; /* modify to reflect the actual output size */
return 0;
}
上面编码器的 C++ 解码器对应物。此代码也是公有领域。虽然可以使用与编码中类似的表格的更通用解决方案,然后在该表格中找到我们想要使用的字符的索引,虽然该解决方案更简单,但使用 if-else-if 梯子的版本更快,因为它不需要运行那么多的字符比较来找到要使用的正确字符。但是,此代码中特定于字母的部分在大多数 base64 字母中几乎相同。
需要更改以适应 base64url 编码的项目在注释中说明。
#include <string>
#include <vector>
// Prototype
// std::vector<BYTE> base64Decode(const std::basic_string<TCHAR>& input);
// This line goes in header file
/* Define these if they aren't already in your environment
* #define TEXT(x) Lx //Unicode
* #define TCHAR wchar_t //Unicode
* #define TCHAR char //Not unicode
* #define TEXT(x) x //Not unicode
* #define DWORD long
* #define BYTE unsigned char
* They are defined by default in Windows.h
*/
const static TCHAR padCharacter = TEXT('=');
std::vector<BYTE> base64Decode(const std::basic_string<TCHAR>& input)
{
if (input.length() % 4) //Sanity check
throw std::runtime_error("Non-Valid base64!");
size_t padding = 0;
if (input.length())
{
if (input[input.length()-1] == padCharacter)
padding++;
if (input[input.length()-2] == padCharacter)
padding++;
}
//Setup a vector to hold the result
std::vector<BYTE> decodedBytes;
decodedBytes.reserve(((input.length()/4)*3) - padding);
DWORD temp=0; //Holds decoded quanta
std::basic_string<TCHAR>::const_iterator cursor = input.begin();
while (cursor < input.end())
{
for (size_t quantumPosition = 0; quantumPosition < 4; quantumPosition++)
{
temp <<= 6;
if (*cursor >= 0x41 && *cursor <= 0x5A) // This area will need tweaking if
temp |= *cursor - 0x41; // you are using an alternate alphabet
else if (*cursor >= 0x61 && *cursor <= 0x7A)
temp |= *cursor - 0x47;
else if (*cursor >= 0x30 && *cursor <= 0x39)
temp |= *cursor + 0x04;
else if (*cursor == 0x2B)
temp |= 0x3E; //change to 0x2D for URL alphabet
else if (*cursor == 0x2F)
temp |= 0x3F; //change to 0x5F for URL alphabet
else if (*cursor == padCharacter) //pad
{
switch( input.end() - cursor )
{
case 1: //One pad character
decodedBytes.push_back((temp >> 16) & 0x000000FF);
decodedBytes.push_back((temp >> 8 ) & 0x000000FF);
return decodedBytes;
case 2: //Two pad characters
decodedBytes.push_back((temp >> 10) & 0x000000FF);
return decodedBytes;
default:
throw std::runtime_error("Invalid Padding in Base 64!");
}
} else
throw std::runtime_error("Non-Valid Character in Base 64!");
cursor++;
}
decodedBytes.push_back((temp >> 16) & 0x000000FF);
decodedBytes.push_back((temp >> 8 ) & 0x000000FF);
decodedBytes.push_back((temp ) & 0x000000FF);
}
return decodedBytes;
}
base64decode <- function(eobj) {
estr <- as.character(eobj)
stopifnot(length(estr) == 1)
b64c <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
estr <- gsub(paste("[^",b64c,"=]"),"",estr)
if (nchar(estr) == 0) return("")
estr <- gsub("=","A",estr,fixed=T)
ebit <- charToRaw(estr); b64r <- charToRaw(b64c)
pces <- lapply(seq(1,length(ebit),by=4),function(ii) ebit[ii:(ii+3)])
slft <- function(p,n) bitwShiftL(match(p,b64r)-1,n) # note the -1
sand <- function(n,s) intToUtf8(bitwAnd(bitwShiftR(n,s),0xFF))
return(paste(sapply(pces,function(p){
n <- slft(p[1],18) + slft(p[2],12) + slft(p[3],6) + slft(p[4],0)
c(sand(n,16),sand(n,8),sand(n,0))
}),collapse=""))
}