waldhacker / pseudify-profile-templates
pseudify的数据库匿名化工具的模板
0.0.2
2023-02-27 14:10 UTC
Requires
- egulias/email-validator: ^3.2
Requires (Dev)
- ergebnis/composer-normalize: ^2.28
- friendsofphp/php-cs-fixer: ^3.13
- phpstan/phpstan: ^1.9
- roave/security-advisories: dev-latest
- symfony/flex: ^2.2
- vimeo/psalm: ^4.30
README
Pseudify 是一个帮助您匿名化数据库数据的工具箱。
您可以在数据库中找到隐藏的个人数据,并将它们进行匿名化。
🎉 从任何应用程序中分析并匿名化受支持的数据库
🎉 查找隐藏的个人数据
🎉 数据完整性:相同的输入数据在所有数据库列中都会生成相同的匿名数据
🎉 轻松分析和匿名化经过编码的数据
🎉 分析和匿名化经过多重编码的数据
🎉 分析和匿名化复杂数据结构,如JSON或序列化PHP数据
🎉 分析和匿名化动态数据
🎉 12个内置解码器/编码器
🎉 通过自定义解码器/编码器进行扩展
🎉 得益于 FakerPHP,提供100多个内置的本地化假数据格式
🎉 通过自定义假数据格式进行扩展
🎉 得益于 Doctrine DBAL,支持7个内置数据库平台
🎉 通过自定义数据库平台进行扩展
🎉 使用 PHP 代码对配置(profile)进行建模
安装
docker run -it -v $(pwd):/app -u $(id -u):$(id -g) \
composer create-project --no-dev --remove-vcs waldhacker/pseudify-profile-templates .
启动数据库服务器
docker network create pseudify-net
docker run --rm --detach \
--network pseudify-net \
--name mariadb_10_5 \
--env MARIADB_USER=pseudify \
--env MARIADB_PASSWORD='pseudify(!)w4ldh4ck3r' \
--env MARIADB_ROOT_PASSWORD='pseudify(!)w4ldh4ck3r' \
--env MARIADB_DATABASE=pseudify_utf8mb4 \
-v $(pwd)/tests/mariadb/10.5:/docker-entrypoint-initdb.d \
mariadb:10.5
配置pseudify
cp tests/mariadb/10.5/.env .env
启动pseudify
docker run -it -v $(pwd):/data --network=pseudify-net \
ghcr.io/waldhacker/pseudify pseudify:debug:table_schema
匿名化建模快速指南
我想匿名化我的用户表
嗯,这很简单
<?php
namespace Waldhacker\Pseudify\Profiles;
use Waldhacker\Pseudify\Core\Processor\Processing\Pseudonymize\DataManipulatorPreset;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\Column;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\TableDefinition;
use Waldhacker\Pseudify\Core\Profile\Pseudonymize\ProfileInterface;
/**
 * Example profile: replaces three plain-text columns of the "users" table
 * with fake data.
 */
class MyPseudonymizeProfile implements ProfileInterface
{
    /**
     * Identifier of this profile; also handed to the TableDefinition
     * constructor in getTableDefinition().
     */
    public function getIdentifier(): string
    {
        return 'my-custom-app';
    }

    /**
     * Describes which columns of "users" are pseudonymized and with which
     * FakerPHP/Faker formatter each one is filled.
     */
    public function getTableDefinition(): TableDefinition
    {
        $definition = new TableDefinition($this->getIdentifier());

        // Column name => FakerPHP/Faker formatter whose output replaces the column values.
        $formatterByColumn = [
            'username' => 'userName',
            'password' => 'argon2iPassword',
            'email' => 'safeEmail',
        ];

        $columns = [];
        foreach ($formatterByColumn as $columnName => $formatter) {
            $columns[] = Column::create($columnName)
                ->addDataProcessing(DataManipulatorPreset::scalarData($formatter));
        }

        $definition->addTable(table: 'users', columns: $columns);

        return $definition;
    }
}
... 但名字列是十六进制编码的(出于某种原因)
没问题
<?php
namespace Waldhacker\Pseudify\Profiles;
use Waldhacker\Pseudify\Core\Processor\Processing\Pseudonymize\DataManipulatorPreset;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\Column;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\TableDefinition;
use Waldhacker\Pseudify\Core\Profile\Pseudonymize\ProfileInterface;
/**
 * Example profile: pseudonymizes a column whose values are stored hex-encoded.
 */
class MyPseudonymizeProfile implements ProfileInterface
{
    /**
     * Identifier of this profile; also handed to the TableDefinition
     * constructor in getTableDefinition().
     */
    public function getIdentifier(): string
    {
        return 'my-custom-app';
    }

    /**
     * Describes the pseudonymization of the hex-encoded "first_name" column.
     */
    public function getTableDefinition(): TableDefinition
    {
        $definition = new TableDefinition($this->getIdentifier());

        // ... (other columns omitted)

        // Declaring the column as DATA_TYPE_HEX results in these steps:
        //   1. the stored value is read and decoded from its hex representation,
        //   2. the decoded value is replaced with fake data generated by the
        //      FakerPHP/Faker formatter "firstName",
        //   3. the replacement is encoded as hex again and saved back to the database.
        $firstNameColumn = Column::create('first_name', Column::DATA_TYPE_HEX);
        $firstNameColumn = $firstNameColumn->addDataProcessing(
            DataManipulatorPreset::scalarData('firstName')
        );

        $definition->addTable(table: 'users', columns: [$firstNameColumn]);

        return $definition;
    }
}
... 但姓氏列被zlib压缩,然后以base64编码的形式存储在数据库中(原因更加神秘)
嗯,我们可以处理这个问题
<?php
namespace Waldhacker\Pseudify\Profiles;
use Waldhacker\Pseudify\Core\Processor\Encoder\Base64Encoder;
use Waldhacker\Pseudify\Core\Processor\Encoder\ChainedEncoder;
use Waldhacker\Pseudify\Core\Processor\Encoder\GzEncodeEncoder;
use Waldhacker\Pseudify\Core\Processor\Processing\Pseudonymize\DataManipulatorPreset;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\Column;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\TableDefinition;
use Waldhacker\Pseudify\Core\Profile\Pseudonymize\ProfileInterface;
/**
 * Example profile: pseudonymizes a column that is stored compressed and then
 * base64-encoded.
 */
class MyPseudonymizeProfile implements ProfileInterface
{
    /**
     * Identifier of this profile; also handed to the TableDefinition
     * constructor in getTableDefinition().
     */
    public function getIdentifier(): string
    {
        return 'my-custom-app';
    }

    /**
     * Describes the pseudonymization of the doubly encoded "last_name" column.
     */
    public function getTableDefinition(): TableDefinition
    {
        $definition = new TableDefinition($this->getIdentifier());

        // ... (other columns omitted)

        // Processing of "last_name":
        //   reading: base64 -> binary, then the binary is decompressed
        //   replace: fake data from the FakerPHP/Faker formatter "lastName"
        //   writing: the replacement is compressed, then base64-encoded and
        //            saved back to the database
        $lastNameColumn = Column::create('last_name');

        $base64 = new Base64Encoder();
        // NOTE(review): ZLIB_ENCODING_GZIP selects the gzip format; confirm this
        // matches how the application compressed the stored data.
        $compression = new GzEncodeEncoder([
            GzEncodeEncoder::ENCODE_LEVEL => 5,
            GzEncodeEncoder::ENCODE_ENCODING => ZLIB_ENCODING_GZIP,
        ]);

        $lastNameColumn = $lastNameColumn->setEncoder(new ChainedEncoder([$base64, $compression]));
        $lastNameColumn = $lastNameColumn->addDataProcessing(
            DataManipulatorPreset::scalarData('lastName')
        );

        $definition->addTable(table: 'users', columns: [$lastNameColumn]);

        return $definition;
    }
}
... 但"payload"列的内容取决于"payload_type"列的值
为什么不试试,来吧
<?php
namespace Waldhacker\Pseudify\Profiles;
use Waldhacker\Pseudify\Core\Processor\Processing\DataProcessing;
use Waldhacker\Pseudify\Core\Processor\Processing\Pseudonymize\DataManipulatorContext;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\Column;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\TableDefinition;
use Waldhacker\Pseudify\Core\Profile\Pseudonymize\ProfileInterface;
/**
 * Example profile: pseudonymizes the "payload" column differently depending
 * on the value of the "payload_type" column in the same row.
 */
class MyPseudonymizeProfile implements ProfileInterface
{
    /**
     * Identifier of this profile; also handed to the TableDefinition
     * constructor in getTableDefinition().
     */
    public function getIdentifier(): string
    {
        return 'my-custom-app';
    }

    /**
     * Describes the row-dependent pseudonymization of the "payload" column.
     */
    public function getTableDefinition(): TableDefinition
    {
        $definition = new TableDefinition($this->getIdentifier());

        // ... (other columns omitted)

        $payloadColumn = Column::create('payload');

        $processing = new DataProcessing(function (DataManipulatorContext $context): void {
            // Current data of the "payload" column.
            $original = $context->getProcessedData();

            // Data of all columns of the current row.
            // NOTE(review): the method name is spelled "Datebase" in this guide;
            // confirm against the library API before changing it.
            $row = $context->getDatebaseRow();

            // "foo" payloads are replaced with a fake user name,
            // everything else with a fake e-mail address.
            $wantsUserName = 'foo' === $row['payload_type'];
            $faker = $context->fake($original);
            $replacement = $wantsUserName ? $faker->userName() : $faker->safeEmail();

            // Store the pseudonymized value as the new column content.
            $context->setProcessedData($replacement);
        });

        $payloadColumn = $payloadColumn->addDataProcessing($processing);

        $definition->addTable(table: 'users', columns: [$payloadColumn]);

        return $definition;
    }
}
... 但"session_data"列的内容包含复杂的数据格式,在这种情况下是序列化PHP数据
这比想象中要简单
<?php
namespace Waldhacker\Pseudify\Profiles;
use Waldhacker\Pseudify\Core\Processor\Encoder\Serialized\Node\StringNode;
use Waldhacker\Pseudify\Core\Processor\Processing\DataProcessing;
use Waldhacker\Pseudify\Core\Processor\Processing\Pseudonymize\DataManipulatorContext;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\Column;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\TableDefinition;
use Waldhacker\Pseudify\Core\Profile\Pseudonymize\ProfileInterface;
/**
 * Example profile: pseudonymizes values inside serialized PHP data stored in
 * the "session_data" column.
 */
class MyPseudonymizeProfile implements ProfileInterface
{
    /**
     * Identifier of this profile; also handed to the TableDefinition
     * constructor in getTableDefinition().
     */
    public function getIdentifier(): string
    {
        return 'my-custom-app';
    }

    /**
     * Describes the pseudonymization of the serialized "session_data" column.
     */
    public function getTableDefinition(): TableDefinition
    {
        $definition = new TableDefinition($this->getIdentifier());

        // ... (other columns omitted)

        // Processing of "session_data":
        //   reading: the serialized PHP value is decoded into an AST
        //   replace: selected values in the AST are swapped for fake data by hand
        //   writing: the AST is turned back into a serialized PHP value and
        //            saved back to the database
        $sessionColumn = Column::create('session_data', Column::DATA_TYPE_SERIALIZED);

        $processing = new DataProcessing(function (DataManipulatorContext $context): void {
            // Example of the plain (serialized) data:
            // a:2:{i:0;s:14:"243.202.241.67";s:4:"user";O:8:"stdClass":5:{s:8:"userName";s:11:"georgiana59";s:8:"lastName";s:5:"Block";s:5:"email";s:19:"nolan11@example.net";s:2:"id";i:2;s:4:"user";R:3;}}

            // Root of the AST that represents the session data.
            $rootNode = $context->getDecodedData();

            // Original values that have to be pseudonymized.
            $originalIp = $rootNode->getPropertyContent(0)->getValue();
            $originalUser = $rootNode->getPropertyContent('user');
            $originalUserName = $originalUser->getPropertyContent('userName')->getValue();
            $originalLastName = $originalUser->getPropertyContent('lastName')->getValue();
            $originalEmail = $originalUser->getPropertyContent('email')->getValue();

            // Fake replacements, each derived from its original value.
            $replacementIp = $context->fake($originalIp)->ipv4();
            $replacementUserName = $context->fake($originalUserName)->userName();
            $replacementLastName = $context->fake($originalLastName)->lastName();
            $replacementEmail = $context->fake($originalEmail)->safeEmail();

            // Swap the IP on the root node, then continue on its "user" property.
            $updatedRoot = $rootNode->replaceProperty(0, new StringNode($replacementIp));
            $userNode = $updatedRoot->getPropertyContent('user');
            $userNode = $userNode->replaceProperty('userName', new StringNode($replacementUserName));
            $userNode = $userNode->replaceProperty('lastName', new StringNode($replacementLastName));
            $userNode->replaceProperty('email', new StringNode($replacementEmail));

            // Store the pseudonymized AST as the new column content.
            $context->setProcessedData($rootNode);
        });

        $sessionColumn = $sessionColumn->addDataProcessing($processing);

        $definition->addTable(table: 'users', columns: [$sessionColumn]);

        return $definition;
    }
}
... 但"session_data"列的内容包含复杂的数据格式,在这种情况下是JSON字符串
好吧,就这样做
<?php
namespace Waldhacker\Pseudify\Profiles;
use Waldhacker\Pseudify\Core\Processor\Processing\DataProcessing;
use Waldhacker\Pseudify\Core\Processor\Processing\Pseudonymize\DataManipulatorContext;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\Column;
use Waldhacker\Pseudify\Core\Profile\Model\Pseudonymize\TableDefinition;
use Waldhacker\Pseudify\Core\Profile\Pseudonymize\ProfileInterface;
/**
 * Example profile: pseudonymizes values inside a JSON string stored in the
 * "session_data" column.
 */
class MyPseudonymizeProfile implements ProfileInterface
{
    /**
     * Identifier of this profile; also handed to the TableDefinition
     * constructor in getTableDefinition().
     */
    public function getIdentifier(): string
    {
        return 'my-custom-app';
    }

    /**
     * Describes the pseudonymization of the JSON "session_data" column.
     */
    public function getTableDefinition(): TableDefinition
    {
        $definition = new TableDefinition($this->getIdentifier());

        // ... (other columns omitted)

        // Processing of "session_data":
        //   reading: the JSON string is decoded into an associative array
        //   replace: selected array entries are swapped for fake data by hand
        //   writing: the array is turned back into a JSON string and
        //            saved back to the database
        $sessionColumn = Column::create('session_data', Column::DATA_TYPE_JSON);

        $processing = new DataProcessing(function (DataManipulatorContext $context): void {
            // Example of the plain (JSON) data:
            // {"userName":"ronaldo15","email":"mcclure.ofelia@example.com","lastName":"Keeling","ip":"1321:57fc:460b:d4d0:d83f:c200:4b:f1c8"}

            // Decoded data (JSON -> array).
            $session = $context->getDecodedData();

            // Original values that have to be pseudonymized.
            $originalUserName = $session['userName'];
            $originalEmail = $session['email'];
            $originalLastName = $session['lastName'];
            $originalIp = $session['ip'];

            // Fake replacements keyed by the array entry they overwrite.
            $replacements = [
                'userName' => $context->fake($originalUserName)->userName(),
                'email' => $context->fake($originalEmail)->safeEmail(),
                'lastName' => $context->fake($originalLastName)->lastName(),
                'ip' => $context->fake($originalIp)->ipv6(),
            ];

            foreach ($replacements as $key => $fakeValue) {
                $session[$key] = $fakeValue;
            }

            // Store the pseudonymized array as the new column content.
            $context->setProcessedData($session);
        });

        $sessionColumn = $sessionColumn->addDataProcessing($processing);

        $definition->addTable(table: 'users', columns: [$sessionColumn]);

        return $definition;
    }
}