跳转到内容

算法实现/杂项/Base64

来自维基教科书,开放的书籍,开放的世界

传统的(MIME)base64 编码和解码过程相当简单,可以用 JavaScript 实现,包括在特定行长度下所需的 MIME/等行分隔符。值得注意的是,许多 base64 函数(例如在 PHP 中)返回没有行分隔符的 base64 编码字符串,因为行分隔符可以在编码后轻松插入,并且许多情况下,base64 编码仅用于通过 XML 安全传输数据或插入数据库等情况下, — 在这些情况下,已知换行符是不必要且因此不受欢迎的。如果不需要,可以在这些函数中轻松注释掉换行符插入和删除(它们在各自函数中都只有一行)。

编码需要一个 base 64 字符数组,例如

 var base64chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'.split("");

解码将需要逆列表(交换值索引),例如

var base64inv = {}; 
for (var i = 0; i < base64chars.length; i++) 
{ 
  base64inv[base64chars[i]] = i; 
}

请注意,在实际实现中,最好显式地列出上面每个列表的整个数组/哈希表 - 这里的一行代码是为了尽可能直观地演示概念,而不是实际应用中的理想情况。

Javascript

[编辑 | 编辑源代码]

base64 编码函数

function base64_encode (s)
{
  // the result/encoded string, the padding string, and the pad count
  var base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  var r = ""; 
  var p = ""; 
  var c = s.length % 3;

  // add a right zero pad to make this string a multiple of 3 characters
  if (c > 0) { 
    for (; c < 3; c++) { 
      p += '='; 
      s += "\0"; 
    } 
  }

  // increment over the length of the string, three characters at a time
  for (c = 0; c < s.length; c += 3) {

    // we add newlines after every 76 output characters, according to the MIME specs
    if (c > 0 && (c / 3 * 4) % 76 == 0) { 
      r += "\r\n"; 
    }

    // these three 8-bit (ASCII) characters become one 24-bit number
    var n = (s.charCodeAt(c) << 16) + (s.charCodeAt(c+1) << 8) + s.charCodeAt(c+2);

    // this 24-bit number gets separated into four 6-bit numbers
    n = [(n >>> 18) & 63, (n >>> 12) & 63, (n >>> 6) & 63, n & 63];

    // those four 6-bit numbers are used as indices into the base64 character list
    r += base64chars[n[0]] + base64chars[n[1]] + base64chars[n[2]] + base64chars[n[3]];
  }
   // add the actual padding string, after removing the zero pad
  return r.substring(0, r.length - p.length) + p;
}

base64 编码函数

class Base64Encode {
    private final static String base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    public static String encode(String s) {

	// the result/encoded string, the padding string, and the pad count
	String r = "", p = "";
	int c = s.length() % 3;

	// add a right zero pad to make this string a multiple of 3 characters
	if (c > 0) {
	    for (; c < 3; c++) {
		p += "=";
		s += "\0";
	    }
	}

	// increment over the length of the string, three characters at a time
	for (c = 0; c < s.length(); c += 3) {

	    // we add newlines after every 76 output characters, according to
	    // the MIME specs
	    if (c > 0 && (c / 3 * 4) % 76 == 0)
		r += "\r\n";

	    // these three 8-bit (ASCII) characters become one 24-bit number
	    int n = (s.charAt(c) << 16) + (s.charAt(c + 1) << 8)
		    + (s.charAt(c + 2));

	    // this 24-bit number gets separated into four 6-bit numbers
	    int n1 = (n >> 18) & 63, n2 = (n >> 12) & 63, n3 = (n >> 6) & 63, n4 = n & 63;

	    // those four 6-bit numbers are used as indices into the base64
	    // character list
	    r += "" + base64chars.charAt(n1) + base64chars.charAt(n2)
		    + base64chars.charAt(n3) + base64chars.charAt(n4);
	}

	return r.substring(0, r.length() - p.length()) + p;
    }
}

另一种实现可以使用 Java 流。例如,可以创建一个 Base64EncoderStream,继承自 FilterOutputStream。

Base64 模块文档 展示了编码字符串的最短方法

irb(main):001:0> require "base64"
=> true
irb(main):002:0> enc   = Base64.encode64('Send reinforcements') 
=> "U2VuZCByZWluZm9yY2VtZW50cw==\n"
irb(main):003:0> plain = Base64.decode64(enc)                   
=> "Send reinforcements"

Ruby 版本,实现为一个工具,可以编码命令行中指定的特定文件

BASE64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
File.open(ARGV[0], 'rb') do |file| # read file name passed on command line
	c = 0 # count outputted characters
	while (buf = file.read(3)) != nil do # read up to 3 bytes at a time
		pad = '' # padding for this group of 4 characters
		while buf.length < 3
			buf << "\0"
			pad << '='
		end
		group24 = (buf[0] << 16) | (buf[1] << 8) | buf[2] # current 3 bytes as a 24 bit value
		encoded = BASE64_CHARS[(group24 >> 18) & 0x3f, 1] # read the 24 bit value 6 bits at a time
		encoded << BASE64_CHARS[(group24 >> 12) & 0x3f, 1]
		encoded << BASE64_CHARS[(group24 >> 6) & 0x3f, 1]
		encoded << BASE64_CHARS[(group24 >> 0) & 0x3f, 1]
		encoded[4 - pad.length, pad.length] = pad # add the padding
		print encoded
		print "\r\n" if (c += 4) % 76 == 0 # output newline ever 76th character
	end
end

base64 编码函数的 C 版本

#include <inttypes.h>
#include <string.h>

int base64encode(const void* data_buf, size_t dataLength, char* result, size_t resultSize)
{
   const char base64chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
   const uint8_t *data = (const uint8_t *)data_buf;
   size_t resultIndex = 0;
   size_t x;
   uint32_t n = 0;
   int padCount = dataLength % 3;
   uint8_t n0, n1, n2, n3;

   /* increment over the length of the string, three characters at a time */
   for (x = 0; x < dataLength; x += 3) 
   {
      /* these three 8-bit (ASCII) characters become one 24-bit number */
      n = ((uint32_t)data[x]) << 16; //parenthesis needed, compiler depending on flags can do the shifting before conversion to uint32_t, resulting to 0
      
      if((x+1) < dataLength)
         n += ((uint32_t)data[x+1]) << 8;//parenthesis needed, compiler depending on flags can do the shifting before conversion to uint32_t, resulting to 0
      
      if((x+2) < dataLength)
         n += data[x+2];

      /* this 24-bit number gets separated into four 6-bit numbers */
      n0 = (uint8_t)(n >> 18) & 63;
      n1 = (uint8_t)(n >> 12) & 63;
      n2 = (uint8_t)(n >> 6) & 63;
      n3 = (uint8_t)n & 63;
            
      /*
       * if we have one byte available, then its encoding is spread
       * out over two characters
       */
      if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
      result[resultIndex++] = base64chars[n0];
      if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
      result[resultIndex++] = base64chars[n1];

      /*
       * if we have only two bytes available, then their encoding is
       * spread out over three chars
       */
      if((x+1) < dataLength)
      {
         if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
         result[resultIndex++] = base64chars[n2];
      }

      /*
       * if we have all three bytes available, then their encoding is spread
       * out over four characters
       */
      if((x+2) < dataLength)
      {
         if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
         result[resultIndex++] = base64chars[n3];
      }
   }

   /*
    * create and add padding that is required if we did not have a multiple of 3
    * number of characters available
    */
   if (padCount > 0) 
   { 
      for (; padCount < 3; padCount++) 
      { 
         if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
         result[resultIndex++] = '=';
      } 
   }
   if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
   result[resultIndex] = 0;
   return 0;   /* indicate success */
}

以下是 Base64Encode 的 C++ 版本。此代码发布到公有领域。

#include <string>
#include <vector>

// Prototype
// std::basic_string<TCHAR> base64Encode(std::vector<BYTE> inputBuffer);
// This line goes in header file

/* Define these if they aren't already in your environment
 * #define TEXT(x) Lx    //Unicode
 * #define TCHAR wchar_t //Unicode
 * #define TCHAR char    //Not unicode
 * #define TEXT(x) x     //Not unicode
 * #define DWORD long
 * #define BYTE unsigned char
 * They are defined by default in Windows.h
 */

//Lookup table for encoding
//If you want to use an alternate alphabet, change the characters here
const static TCHAR encodeLookup[] = TEXT("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
const static TCHAR padCharacter = TEXT('=');
std::basic_string<TCHAR> base64Encode(std::vector<BYTE> inputBuffer)
{
	std::basic_string<TCHAR> encodedString;
	encodedString.reserve(((inputBuffer.size()/3) + (inputBuffer.size() % 3 > 0)) * 4);
	DWORD temp;
	std::vector<BYTE>::iterator cursor = inputBuffer.begin();
	for(size_t idx = 0; idx < inputBuffer.size()/3; idx++)
	{
		temp  = (*cursor++) << 16; //Convert to big endian
		temp += (*cursor++) << 8;
		temp += (*cursor++);
		encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
		encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
		encodedString.append(1,encodeLookup[(temp & 0x00000FC0) >> 6 ]);
		encodedString.append(1,encodeLookup[(temp & 0x0000003F)      ]);
	}
	switch(inputBuffer.size() % 3)
	{
	case 1:
		temp  = (*cursor++) << 16; //Convert to big endian
		encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
		encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
		encodedString.append(2,padCharacter);
		break;
	case 2:
		temp  = (*cursor++) << 16; //Convert to big endian
		temp += (*cursor++) << 8;
		encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
		encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
		encodedString.append(1,encodeLookup[(temp & 0x00000FC0) >> 6 ]);
		encodedString.append(1,padCharacter);
		break;
	}
	return encodedString;
}
base64encode <- function(sobj) {
  sstr <- as.character(sobj)
  stopifnot(length(sstr) == 1) # we only like 1-entry string vectors for now
  if (nchar(sstr) == 0) return("")
  b64c <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
  shfts <- c(18,12,6,0)
  sand <- function(n,s) bitwAnd(bitwShiftR(n,s),63)+1
  slft <- function(p,n) bitwShiftL(as.integer(p),n)
  subs <- function(s,n) substring(s,n,n)
  sbit <- charToRaw(sstr)  
  npad <- ( 3 - length(sbit) %% 3) %% 3 # yeah.
  sbit <- c(sbit,rep(as.raw(0),npad))
  pces <- lapply(seq(1,length(sbit),by=3),function(ii) sbit[ii:(ii+2)])
  encv <- paste(sapply(pces,function(p) paste0(sapply(shfts,function(s)(subs(b64c,sand(slft(p[1],16)+slft(p[2],8)+slft(p[3],0),s)))))),collapse="")
  if (npad > 0) substr(encv,nchar(encv)-npad+1,nchar(encv)) <- paste(rep("=",npad),collapse="")
  return(encv)
}

Javascript

[编辑 | 编辑源代码]

base64 解码函数

function base64_decode (s)
{
  var base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  // remove/ignore any characters not in the base64 characters list
  //  or the pad character -- particularly newlines
  s = s.replace(new RegExp('[^'+base64chars.split("")+'=]', 'g'), "");

  // replace any incoming padding with a zero pad (the 'A' character is zero)
  var p = (s.charAt(s.length-1) == '=' ? 
          (s.charAt(s.length-2) == '=' ? 'AA' : 'A') : ""); 
  var r = ""; 
  s = s.substr(0, s.length - p.length) + p;

  // increment over the length of this encoded string, four characters at a time
  for (var c = 0; c < s.length; c += 4) {

    // each of these four characters represents a 6-bit index in the base64 characters list
    //  which, when concatenated, will give the 24-bit number for the original 3 characters
    var n = (base64chars.indexOf(s.charAt(c)) << 18) + (base64chars.indexOf(s.charAt(c+1)) << 12) +
            (base64chars.indexOf(s.charAt(c+2)) << 6) + base64chars.indexOf(s.charAt(c+3));

    // split the 24-bit number into the original three 8-bit (ASCII) characters
    r += String.fromCharCode((n >>> 16) & 255, (n >>> 8) & 255, n & 255);
  }
   // remove any zero pad that was added to make this a multiple of 24 bits
  return r.substring(0, r.length - p.length);
}

上面的实现最适合 JavaScript 之类的语言,这些语言非常有效地处理任意长度字符串的字符串连接。其他语言(例如 C)可以通过为新字符串/数组分配适当大小的内存来更有效地工作(输出字符串长度可以很容易地在开头从输入字符串计算出来),然后只需设置每个字符索引,而不是连接。

base64 解码函数

class Base64Decode {
    private final static String base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    public static String decode(String s) {

	// remove/ignore any characters not in the base64 characters list
	// or the pad character -- particularly newlines
	s = s.replaceAll("[^" + base64chars + "=]", "");

	// replace any incoming padding with a zero pad (the 'A' character is
	// zero)
	String p = (s.charAt(s.length() - 1) == '=' ? 
		(s.charAt(s.length() - 2) == '=' ? "AA" : "A") : "");
	String r = "";
	s = s.substring(0, s.length() - p.length()) + p;

	// increment over the length of this encoded string, four characters
	// at a time
	for (int c = 0; c < s.length(); c += 4) {

	    // each of these four characters represents a 6-bit index in the
	    // base64 characters list which, when concatenated, will give the
	    // 24-bit number for the original 3 characters
	    int n = (base64chars.indexOf(s.charAt(c)) << 18)
		    + (base64chars.indexOf(s.charAt(c + 1)) << 12)
		    + (base64chars.indexOf(s.charAt(c + 2)) << 6)
		    + base64chars.indexOf(s.charAt(c + 3));

	    // split the 24-bit number into the original three 8-bit (ASCII)
	    // characters
	    r += "" + (char) ((n >>> 16) & 0xFF) + (char) ((n >>> 8) & 0xFF)
		    + (char) (n & 0xFF);
	}

	// remove any zero pad that was added to make this a multiple of 24 bits
	return r.substring(0, r.length() - p.length());
    }
}

另一种实现可以使用 Java 流。例如,可以创建一个 Base64DecoderStream,继承自 FilterInputStream。

上面编码器的 C 解码器对应物。此代码也是公有领域。此解决方案已使用指针数学和查找表进行了优化。该算法处理多种编码格式:有和没有换行符,有和没有空格,有和没有填充字符。

#define WHITESPACE 64
#define EQUALS     65
#define INVALID    66

static const unsigned char d[] = {
    66,66,66,66,66,66,66,66,66,66,64,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,62,66,66,66,63,52,53,
    54,55,56,57,58,59,60,61,66,66,66,65,66,66,66, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,66,66,66,66,66,66,26,27,28,
    29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66
};

int base64decode (char *in, size_t inLen, unsigned char *out, size_t *outLen) { 
    char *end = in + inLen;
    char iter = 0;
    uint32_t buf = 0;
    size_t len = 0;
    
    while (in < end) {
        unsigned char c = d[*in++];
        
        switch (c) {
        case WHITESPACE: continue;   /* skip whitespace */
        case INVALID:    return 1;   /* invalid input, return error */
        case EQUALS:                 /* pad character, end of data */
            in = end;
            continue;
        default:
            buf = buf << 6 | c;
            iter++; // increment the number of iteration
            /* If the buffer is full, split it into bytes */
            if (iter == 4) {
                if ((len += 3) > *outLen) return 1; /* buffer overflow */
                *(out++) = (buf >> 16) & 255;
                *(out++) = (buf >> 8) & 255;
                *(out++) = buf & 255;
                buf = 0; iter = 0;

            }   
        }
    }
   
    if (iter == 3) {
        if ((len += 2) > *outLen) return 1; /* buffer overflow */
        *(out++) = (buf >> 10) & 255;
        *(out++) = (buf >> 2) & 255;
    }
    else if (iter == 2) {
        if (++len > *outLen) return 1; /* buffer overflow */
        *(out++) = (buf >> 4) & 255;
    }

    *outLen = len; /* modify to reflect the actual output size */
    return 0;
}

上面编码器的 C++ 解码器对应物。此代码也是公有领域。虽然可以使用与编码中类似的表格的更通用解决方案,然后在该表格中找到我们想要使用的字符的索引,虽然该解决方案更简单,但使用 if-else-if 梯子的版本更快,因为它不需要运行那么多的字符比较来找到要使用的正确字符。但是,此代码中特定于字母的部分在大多数 base64 字母中几乎相同。

需要更改以适应 base64url 编码的项目在注释中说明。

#include <string>
#include <vector>
 
// Prototype
// std::vector<BYTE> base64Decode(const std::basic_string<TCHAR>& input);
// This line goes in header file
 
/* Define these if they aren't already in your environment
 * #define TEXT(x) Lx    //Unicode
 * #define TCHAR wchar_t //Unicode
 * #define TCHAR char    //Not unicode
 * #define TEXT(x) x     //Not unicode
 * #define DWORD long
 * #define BYTE unsigned char
 * They are defined by default in Windows.h
 */
 
const static TCHAR padCharacter = TEXT('=');
std::vector<BYTE> base64Decode(const std::basic_string<TCHAR>& input)
{
	if (input.length() % 4) //Sanity check
		throw std::runtime_error("Non-Valid base64!");
	size_t padding = 0;
	if (input.length())
	{
		if (input[input.length()-1] == padCharacter)
			padding++;
		if (input[input.length()-2] == padCharacter)
			padding++;
	}
	//Setup a vector to hold the result
	std::vector<BYTE> decodedBytes;
	decodedBytes.reserve(((input.length()/4)*3) - padding);
	DWORD temp=0; //Holds decoded quanta
	std::basic_string<TCHAR>::const_iterator cursor = input.begin();
	while (cursor < input.end())
	{
		for (size_t quantumPosition = 0; quantumPosition < 4; quantumPosition++)
		{
			temp <<= 6;
			if       (*cursor >= 0x41 && *cursor <= 0x5A) // This area will need tweaking if
				temp |= *cursor - 0x41;		              // you are using an alternate alphabet
			else if  (*cursor >= 0x61 && *cursor <= 0x7A)
				temp |= *cursor - 0x47;
			else if  (*cursor >= 0x30 && *cursor <= 0x39)
				temp |= *cursor + 0x04;
			else if  (*cursor == 0x2B)
				temp |= 0x3E; //change to 0x2D for URL alphabet
			else if  (*cursor == 0x2F)
				temp |= 0x3F; //change to 0x5F for URL alphabet
			else if  (*cursor == padCharacter) //pad
			{
				switch( input.end() - cursor )
				{
				case 1: //One pad character
					decodedBytes.push_back((temp >> 16) & 0x000000FF);
					decodedBytes.push_back((temp >> 8 ) & 0x000000FF);
					return decodedBytes;
				case 2: //Two pad characters
					decodedBytes.push_back((temp >> 10) & 0x000000FF);
					return decodedBytes;
				default:
					throw std::runtime_error("Invalid Padding in Base 64!");
				}
			}  else
				throw std::runtime_error("Non-Valid Character in Base 64!");
			cursor++;
		}
		decodedBytes.push_back((temp >> 16) & 0x000000FF);
		decodedBytes.push_back((temp >> 8 ) & 0x000000FF);
		decodedBytes.push_back((temp      ) & 0x000000FF);
	}
	return decodedBytes;
}
base64decode <- function(eobj) {
  estr <- as.character(eobj)
  stopifnot(length(estr) == 1)
  b64c <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
  estr <- gsub(paste("[^",b64c,"=]"),"",estr)
  if (nchar(estr) == 0) return("")
  estr <- gsub("=","A",estr,fixed=T)
  ebit <- charToRaw(estr); b64r <- charToRaw(b64c)
  pces <- lapply(seq(1,length(ebit),by=4),function(ii) ebit[ii:(ii+3)])
  slft <- function(p,n) bitwShiftL(match(p,b64r)-1,n) # note the -1
  sand <- function(n,s) intToUtf8(bitwAnd(bitwShiftR(n,s),0xFF))
  return(paste(sapply(pces,function(p){
    n <- slft(p[1],18) + slft(p[2],12) + slft(p[3],6) + slft(p[4],0)
    c(sand(n,16),sand(n,8),sand(n,0))
  }),collapse=""))
}


华夏公益教科书