Why can Java run this code with no time cost?

Question
I have written a little program to generate unique IDs and print out the time cost. Here is the code:
public class JavaSF_OLD {
    static int randomNumberShiftBits = 12;
    static int randomNumberMask = (1 << randomNumberShiftBits) - 1;
    static int machineNumberShiftBits = 5;
    static int machineNumberMask = (1 << machineNumberShiftBits) - 1;
    static int dataCenterNumberShiftBits = 5;
    static int dataCenterNumberMask = (1 << dataCenterNumberShiftBits) - 1;
    static int dateTimeShiftBits = 41;
    static long dateTimeMask = (1L << dateTimeShiftBits) - 1;
    static int snowFlakeId = 0;
    static long lastTimeStamp = 0;
    static int DataCenterID = 1;
    static int MachineID = 1;

    public static long get() {
        // var current = System.currentTimeMillis();
        var current = 164635438;
        if (current != lastTimeStamp) {
            snowFlakeId = 0;
            lastTimeStamp = current;
        } else {
            snowFlakeId++;
        }
        long id = 0;
        id |= current & dateTimeMask;
        id <<= dataCenterNumberShiftBits;
        id |= DataCenterID & dataCenterNumberMask;
        id <<= machineNumberShiftBits;
        id |= MachineID & machineNumberMask;
        id <<= randomNumberShiftBits;
        id |= snowFlakeId & randomNumberMask;
        return id;
    }

    public static void main(String[] args) {
        long result = 0;
        for (int out = 0; out < 10; out++) {
            var start = System.currentTimeMillis();
            for (int i = 0; i < 1000000000; i++) {
                result = get();
            }
            var end = System.currentTimeMillis();
            System.out.println(end - start);
            System.out.println(result);
        }
    }
}
The result seems a little weird:
53
690531076282879
5
690531076281343
0
690531076283903
0
690531076282367
0
690531076280831
0
690531076283391
0
690531076281855
0
690531076284415
0
690531076282879
0
690531076281343
It takes 0 milliseconds to get the right result, while the C++ version needs 230 milliseconds per result. When I change the inner loop bound from 1000000000 to 1e9, which is of type double, it takes more than one second per result. How can this be?
I changed the loop count of the C++ version and there was no change at all. So I guess Java optimizes the loop and omits the first 999999999 iterations. How does Java actually optimize it, running at no cost yet still producing the correct result? And how can I make the C++ version of the same code skip the useless loop as well? I used the -O3 flag but it doesn't seem to help.
#include <iostream>
#include <chrono>
#include <cstdint>

static const unsigned int randomNumberShiftBits = 12;
static const unsigned int randomNumberMask = (1u << randomNumberShiftBits) - 1;
static const unsigned int machineNumberShiftBits = 5;
static const unsigned int machineNumberMask = (1u << machineNumberShiftBits) - 1;
static const unsigned int dataCenterNumberShiftBits = 5;
static const unsigned int dataCenterNumberMask = (1u << dataCenterNumberShiftBits) - 1;
static const unsigned int dateTimeShiftBits = 41;
static const unsigned long long dateTimeMask = (1ull << dateTimeShiftBits) - 1;
static uint32_t snowFlakeId = 0;
static unsigned long long lastTimeStamp = 0;
static unsigned int DataCenterID = 1;
static unsigned int MachineID = 1;

std::int64_t get() {
    // auto current = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
    auto current = 164635438;
    if (current != lastTimeStamp) {
        snowFlakeId = 0;
        lastTimeStamp = current;
    } else {
        snowFlakeId++;
    }
    unsigned long long id = 0;
    // Datetime part
    id |= static_cast<unsigned long long>(current) & dateTimeMask;
    // DataCenter part
    id <<= dataCenterNumberShiftBits;
    id |= DataCenterID & dataCenterNumberMask;
    // Machine part
    id <<= machineNumberShiftBits;
    id |= MachineID & machineNumberMask;
    // Random number part
    id <<= randomNumberShiftBits;
    id |= snowFlakeId & randomNumberMask;
    return id;
}

int main() {
    for (int out = 0; out < 10; out++) {
        uint64_t result = 0;
        auto start = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::system_clock::now().time_since_epoch()).count();
        for (int i = 0; i < 1000000000; i++) {
            result = get();
        }
        auto end = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::system_clock::now().time_since_epoch()).count();
        std::cout << (end - start) << std::endl;
        std::cout << result << std::endl;
    }
    return 0;
}
This is the C++ version and its output:
1419
690531076282879
1385
690531076281343
1388
690531076283903
1457
690531076282367
1407
690531076280831
1402
690531076283391
1441
690531076281855
1389
690531076284415
1395
690531076282879
1360
690531076281343
As for measuring the time, it's just the code in the main function. I know the algorithm is wrong; I'm just curious why Java can do this and how to make C++ skip the loop as well.
Answer 1
Score: 5
You're operating under a ton of misconceptions.
System.currentTimeMillis() as ID is a bad idea
This is a very bad idea. Your code is clearly designed to treat System.currentTimeMillis (cTM) as a strictly increasing sequence. For example, if the current time is 10000 and I ask for an id, I get 10000:0. If I ask again, I'd get 10000:1. If time then becomes 10001, I'd get 10001:0, and if time then warps back to 10000, I'd get 10000:0 again, violating the intent of generating unique numbers.
But here is the thing: cTM has absolutely no guarantee that it is strictly increasing.
cTM reflects the system clock. On some backward systems, the system clock represents local time and not UTC. Java is supposed to 'fix' that, but getting time warped back 3600000 milliseconds (one hour's worth) is known to happen during daylight savings adjustments. More generally, most computers get the time from some network source and will adjust time by a few seconds (which is THOUSANDS of milliseconds easily) all the time. There are solutions if you must have unique IDs and system time is the only possible provider but entire PhD research papers have been written about how to do it (it's called 'smearing', and your computer is probably not doing it, and the JVM just reports back what the OS is telling it, so it won't smear either).
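To make that concrete, here is a minimal, hypothetical sketch (my own, not from the original answer) that watches for cTM going backwards. On most runs it prints nothing, but an NTP adjustment or a manual clock change during the loop would trigger it:

public class ClockMonotonicityCheck {
    public static void main(String[] args) {
        long prev = System.currentTimeMillis();
        long deadline = prev + 10_000; // sample the wall clock for ~10 seconds
        while (prev < deadline) {
            long now = System.currentTimeMillis();
            if (now < prev) {
                // The 'strictly increasing' assumption just failed.
                System.out.println("Clock went backwards by " + (prev - now) + " ms");
            }
            prev = Math.max(prev, now);
        }
    }
}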
System.nanoTime() is more or less guaranteed to increase, but it loops back around every 36 days or so. If you want unique IDs, use the right tool: UUIDs. Generating unique IDs is harder than you think, and a solved problem. Use the existing solution.
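For completeness, a minimal example of the suggested tool, using the standard library's java.util.UUID:

import java.util.UUID;

public class UuidExample {
    public static void main(String[] args) {
        // randomUUID() produces a version-4 (random) UUID; no clock
        // assumptions are involved, and collisions are astronomically unlikely.
        UUID id = UUID.randomUUID();
        System.out.println(id); // prints a random 128-bit identifier
    }
}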
Timing performance with cTM is a bad idea
That's wrong too. Java works very roughly as follows:
Run all code very slowly and stupidly, and keep track of all sorts of completely irrelevant information, such as 'for this if branch, how often does the expression resolve to true vs. false', or 'how often is this method called'. Gathering these stats makes it even more inefficient. The JVM is now extremely inefficient. But this is good, as you'll see in a moment. Contrast this to C code, where gcc or whatever compiler you use will analyse the heck out of your source code and make the most optimized machine code that it can, but that's where it ends: no bookkeeping. It's optimized code right from the compile step. Versus Java: javac is very simple and quite stupid, it does almost no optimization. It's java itself, at runtime, that does.
Then, from time to time, do an analysis: Which of alllll the methods in the system is taking up the most CPU time? Then, take a moment and use all those seemingly useless stats to generate one heck of an amazing fine-tuned optimized machine-code version of this method. It can and often will outperform handwritten code; after all, java has the benefit of knowing the behaviours in real time, for this actual workload, whereas something like C-written code can't know that. Java even gets to generate code with assumptions built in, because java can 'invalidate' this optimized variant if one of the assumptions fails later on.
The upshot is that, again, oversimplifying by a lot, the general performance characteristic of any given method is that it takes X time per run for a while (X being, say, 1000), then one call that takes waaaay longer as the system does the analysis (say, 10000000), and then all further invokes take Y time, where Y is much, much less than X (say, 10).
The # of cycles at cost 1000, and that one blip as it recompiled, are 'constant', and then the actual time of 10 applies to all further cycles. As more and more cycles are run (and since only often-called methods get optimized, the 10-cost cycles dwarf the others), the 10 is the only number that matters for performance purposes.
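You can actually watch this happen. As an illustration (my own sketch, not part of the original answer), running a hot loop with the standard -XX:+PrintCompilation JVM flag makes HotSpot print a line each time it compiles a method, so the 'blip' becomes visible:

// Run with: java -XX:+PrintCompilation WarmupDemo
// HotSpot logs a line when it compiles warmMe() at successive tiers;
// the per-round timings drop sharply once that happens.
public class WarmupDemo {
    static long warmMe(long x) {
        return x * 31 + 17;
    }

    public static void main(String[] args) {
        long acc = 0;
        for (int round = 0; round < 5; round++) {
            long start = System.nanoTime();
            for (int i = 0; i < 10_000_000; i++) {
                acc = warmMe(acc);
            }
            long end = System.nanoTime();
            System.out.println("round " + round + ": " + (end - start) / 1_000_000 + " ms");
        }
        System.out.println(acc); // use the result so the loop can't be discarded
    }
}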
But it does mean you need to wait until that happened before you measure performance, and that is not easy at all. You also get other 'noise'. Maybe your thread gets pre-empted by winamp because it needs to unpack some more of that MP3 file, causing a huge blip on your timing arbitrarily.
The answer is JMH. Yet again, the same problem: the job at hand (timing a method call) is orders of magnitude more complicated than you think it is, but it is a solved problem, so use the existing solutions.
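As a rough sketch of what that looks like (assuming a project with the JMH dependency and annotation processor configured; the class and method names here are made up for illustration), a benchmark for the asker's method could be:

import java.util.concurrent.TimeUnit;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.State;

@State(Scope.Thread)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
public class SnowflakeBenchmark {
    @Benchmark
    public long measureGet() {
        // Returning the value hands it to JMH, which consumes it in a
        // Blackhole so the JIT cannot dead-code-eliminate the call.
        return JavaSF_OLD.get();
    }
}

JMH runs warmup iterations before it starts measuring, so the reported numbers describe the post-JIT steady state rather than the interpreter phase or the recompilation blip.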
Some guesstimates about your observed performance
If you make that loop counter a double, then you have to add 1 to a double, which can be orders of magnitude slower, as are double comparisons. Eventually your method will just run forever: if you go to large numbers, x+1 is just x, in double land. Think about it: doubles are 64-bit, so they can represent at most 2^64 different numbers. And yet a double can hold, say, 1e308. How can you fit 1e308 pigeons in only 2^64 holes? The answer is: you can't. Not every number between 0 and infinity is representable as a double, and when you try to set something to a number not in that space of 2^64 representables, Java silently rounds to the nearest one. Eventually the gap between representables exceeds 1.0, and at that point i++ fails to make any change to i. That's not quite at 1e9 (it's, I think, at around 2^53), but doing increment counting with doubles is always a bad idea. Go with long if you must.
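A quick way to see that cutoff (a small demo; 2^53 is where the gap between adjacent doubles first exceeds 1.0):

public class DoubleIncrementDemo {
    public static void main(String[] args) {
        double belowLimit = Math.pow(2, 53) - 2; // exactly representable
        System.out.println(belowLimit + 1 == belowLimit); // false: increment still works

        double atLimit = Math.pow(2, 53); // 9007199254740992
        // 2^53 + 1 is not representable and rounds back to 2^53,
        // so a loop doing i++ here would never terminate.
        System.out.println(atLimit + 1 == atLimit); // true
    }
}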
Furthermore, both C and java (not javac, but that HotSpot analyser I talked about in the second point) have 'optimizers'. If the optimizer realizes that [A] you don't actually use the result of get() anywhere in your code, and [B] the get() method either has no side effects at all, or its side effects can be fully covered by running only a fraction of the total instructions in get(), then the optimizer is free to just NOT run that method, or at least run only parts of it, which would cause wildly different performance measurements.
JMH solves this too: For example, it FORCES you to return some numeric value in your measurement method, as JMH will mix this number into a value it keeps track of, thus forcing the optimizer to realize it can't just 'optimize' by skipping the entire call!
cTM is not free
System.currentTimeMillis() is, or can be, quite expensive. C as a language makes almost no promises about anything (it doesn't even promise that an int is 32-bit!), but any particular library implementation tends to make extremely specific promises about what a given call does. Java lies in the middle. It means that what Java actually ends up executing at the OS level when you call cTM may differ, and may involve some caches plus the CPU core's own internal clock, which is many orders of magnitude faster than 'asking for the system time', whereas the C call farms the work out to the system time every time you call it, because the C code assumes that if you want to optimize and estimate with CPU core updates, you'd program that yourself or fetch a library that does. You're (potentially) mostly timing the performance of cTM here and not of your algorithm, and between the C and the Java code cTM may have wildly different implementations. You're comparing guns to grandmas, in other words.
JMH, as usual, helps you out here, and avoids the issues with cTM. Not that I know of a way to compare JMH results with C results, but at least a JMH timing result is something you can trust a heck of a lot more than handspun deltas between cTM calls.
cTM is not as stable as you think it is
cTM sucks. The problem is: clocks are really hard. I know, I know, you can go to the store, buy a 5-cent watch with some cheap crystal in there, and it's surprisingly accurate. But the surface of a computer chip is an extremely inhospitable place, with wild temperature swings, electrons flowing all over the place, and a ton of air being moved nearby. Trying to keep a quartz crystal stable in those conditions is tricky. So either the system clock is far away from the CPU, but then asking for the system time is incredibly expensive compared to basic instructions (literally hundreds of thousands of cycles, as the electrons make their way, like slow molasses, over cables many centimeters long, an eternity in CPU terms), or it's on board (and they are), and not as stable as you'd like.
CPU cores have internal clocks which can be more stable but are more unbound from reflecting any actual time, and cause serious issues if your code is moved to another core which has a completely different core clock. Java gives you access to this via System.nanoTime, and even tries to smooth out the core-hop issues, but as is the theme in this answer: time is way, way harder than you think it is, but fortunately it is a mostly solved problem. Note how nanoTime intentionally returns a meaningless number: it has meaning only in relation to other calls to nanoTime; it doesn't mean anything by itself (whereas cTM means: millis since midnight 1970-01-01 UTC). It's tricky; JMH solved the problem, and you should use that.
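To illustrate the 'only deltas are meaningful' point, here is the intended minimal use of System.nanoTime (fine for quick-and-dirty timing; JMH remains the right tool for real measurements):

public class NanoTimeDelta {
    public static void main(String[] args) throws InterruptedException {
        long t0 = System.nanoTime(); // meaningless in isolation
        Thread.sleep(250);
        long t1 = System.nanoTime();
        // Only the difference carries meaning: elapsed time within this JVM.
        System.out.println("elapsed ms: " + (t1 - t0) / 1_000_000);
    }
}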
Comments