Κυλλήνη

Быстрый генератор случайных имён-фамилий для базы данных

Относительно быстрый генератор псевдослучайного списка имён-фамилий и рандомных данных в базу. Фокус заключается в записи группы сгенерированных данных сперва во временную таблицу в памяти, потом блокировке основной таблицы и переносе данных туда из временной, после чего блокировка снимается, а цикл повторяется необходимое количество раз.

#!/usr/bin/perl

use strict;
use warnings;
use open ':std', ':encoding(UTF-8)';
use DBI;
use Time::HiRes qw( usleep ualarm gettimeofday tv_interval nanosleep
	clock_gettime clock_getres clock_nanosleep clock
	stat lstat );
use Math::Random::MT::Auto qw(rand irand shuffle gaussian),
	'/dev/urandom'; # Speeds up generation of random numbers. May cause errors on non-Unix systems

use constant TOTAL => 650000; # Total number of lines to be inserted
use constant PERSTEP => 1000; # Size of temporary table (speeds up everything). Values between 200 and 2000 are tested
use constant VERBOSE => 1; # Set 1 to generate more output

my $db_host = 'localhost';
my $db_user = 'tester';
my $db_name = 'testbase';
my $db_pass = 'your_password';

my $names_file = 'name_utf.txt'; # Text file with list of names. One per line
my $lastnames_file = 'family_utf.txt'; # Text file with list of surnames. One per line

my $db_string = "DBI:mysql:dbname=$db_name;host=$db_host";
my $dblink = DBI->connect(
	$db_string,
	$db_user,
	$db_pass,
	{PrintError => 1}
) or die "Cannot connect to database: $@";


# -----------------------------------------------------------------------

sub geninfo
{
	my @names;
	my @lastnames;

	my $totalt0 = [gettimeofday]; # Counting total time from this moment

	# Populate the names array
	open(my $fh, '<:encoding(UTF-8)', $names_file)
	or die "Could not open file '$names_file' $!";
	while (my $row = <$fh>)
	{
		chomp $row;
		push (@names, $row);
	}
	close $fh;

	# Populate the surnames array
	open($fh, '<:encoding(UTF-8)', $lastnames_file)
	or die "Could not open file '$lastnames_file' $!";
	while (my $row = <$fh>)
	{
		chomp $row;
		push (@lastnames, $row);
	}
	close $fh;

	# Creating temporary table to speed things up
	# Temporary table structure must be (almost) identical to target table structure. In this example ID column is omitted
	$dblink->prepare("CREATE TEMPORARY TABLE `persons_temp` (
		`name` varchar(24) NOT NULL,
		`secondname` varchar(24) NOT NULL,
		`rnum` int(8) DEFAULT NULL,
		`mtime` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00' ON UPDATE CURRENT_TIMESTAMP
		) ENGINE=MEMORY")->execute();

	my $query = "INSERT INTO `persons_temp` (`name`, `secondname`, `rnum`, `mtime`) VALUES (?, ?, ?, FROM_UNIXTIME(?))";
	my $statement = $dblink->prepare($query);

	for (my $num = 0; $num < TOTAL;)
	{
		my $t0 = [gettimeofday]; # Counting one iteration time from this moment

		$dblink->prepare("DELETE FROM `persons_temp`")->execute(); # Empty temporary table first (it's filled with previous iteration results)
		for (my $stepnum = 0; $stepnum < PERSTEP and $num < TOTAL; $stepnum++, $num++)
		{
			my $pname = ucfirst($names[rand(0+@names)]); # Take random name and uppercase first letter
			my $psname = ucfirst($lastnames[rand(0+@lastnames)]); # Take random surname and uppercase first letter
			my $rnd = int(rand(999999999)); # Random number
			my $dt = time; # Timestamp

			$statement->execute($pname, $psname, $rnd, $dt); # Inserting one row to temporary table
			print "Adding record number " . ($num + 1) . ": $pname $psname $rnd\n" if VERBOSE;
		}

		# Copy data from temporary table to destination table FAST
		$dblink->prepare("LOCK TABLE `persons` WRITE, `persons_temp` READ")->execute();
		$dblink->prepare("INSERT INTO `persons` (`name`, `secondname`, `rnum`, `mtime`) SELECT * FROM `persons_temp`")->execute();
		$dblink->prepare("UNLOCK TABLES;")->execute();

		print "Step takes: " . tv_interval ( $t0 ) . "s\n";
	}

	my $interval = timeinterval(tv_interval($totalt0)); # Convert total time to human-readable format with some damn creepy function (below)
	print "\nJob done in $interval\n";

	undef($query);
	$dblink->prepare("UNLOCK TABLES;")->execute();
	$dblink->prepare("ALTER TABLE `persons` ENABLE KEYS;")->execute();
}

sub timeinterval
{
	my( $seconds ) =  (@_ );

	if ( $seconds < 60 )
	{
		# less than a minute
		return( $seconds . "s" );
	}
	if ( $seconds <= ( 3600 ) )
	{
		# less than an hour
		return( int($seconds/ 60 ) . "m:" . timeinterval($seconds%60) );
	}
	if ( $seconds <= ( 86400 ) )
	{
		# less than a day
		return( int($seconds/3600) . "h:" . timeinterval($seconds%3600) );
	}
	if ( $seconds <= ( 60 * 60 * 24 * 7 ) )
	{
		# less than a week
		return( int($seconds/(86400) ) . "d, " . timeinterval($seconds%86400) );
	}

	# fall-back weeks
	return( int( $seconds/(60*60*24*7)) . " weeks" );
}


# -----------------------------------------------------------------------

geninfo();
$dblink->disconnect();
__END__

Пример вывода:

...
...
Adding record number 649995: Мануил Рыбнов 966212308
Adding record number 649996: Евстигней Девятков 513734457
Adding record number 649997: Бадри Колмыченко 396553781
Adding record number 649998: Август  Хлудев 3156068
Adding record number 649999: Бронислав  Воскресенский 309781404
Adding record number 650000: Мелентий Веретин 19724141
Step takes: 0.156238s

Job done in 3m:8s

Использовалась такая таблица:

CREATE TABLE `persons` (
  `id` int(16) unsigned NOT NULL AUTO_INCREMENT,
  `name` varchar(24) NOT NULL,
  `secondname` varchar(24) NOT NULL,
  `rnum` int(8) DEFAULT NULL,
  `mtime` timestamp NOT NULL DEFAULT '0000-00-00 00:00:00' ON UPDATE CURRENT_TIMESTAMP,
  PRIMARY KEY (`id`),
  KEY `FULLNAME` (`name`,`secondname`)
) ENGINE=InnoDB AUTO_INCREMENT=664951 DEFAULT CHARSET=utf8;

Само собой, скрипт нужно перепилить в районе CREATE TEMPORARY TABLE (и несколько последующих строк), чтобы он удовлетворял вашей схеме.