435 lines
48 KiB
TeX
Raw Permalink Normal View History

%%%%%%%%%%%%%%%%%%%% author.tex %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% sample root file for your "contribution" to a proceedings volume
%
% Use this file as a template for your own input.
%
%%%%%%%%%%%%%%%% Springer %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\documentclass{svproc}
%
% RECOMMENDED %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% to typeset URLs, URIs, and DOIs
\usepackage{url}
\def\UrlFont{\rmfamily}
\usepackage{amsmath}
\usepackage{amssymb}
\newcommand{\erf}{\mathrm{erf}}
\usepackage{booktabs}
\usepackage{graphicx} % insert images
\usepackage{caption} % customize caption formatting (optional)
\usepackage{subcaption} % multiple subfigures (use when a figure has subfigures)
\usepackage{float} % force float placement, e.g. [H]
\begin{document}
\mainmatter % start of a contribution
%
\title{Anomalous Crowd Gathering Prediction Method Based on Spatial-Temporal Graph Convolutional Network in Multi-Camera Surveillance Systems}
%
%\titlerunning{Hamiltonian Mechanics} % abbreviated title (for running head)
% also used for the TOC unless
% \toctitle is used
%
%\author{JiaWei Wang\inst{1} \and Ye Li\inst{1} \and
%Li Zhan \and Nan Zhou }
%\author{JiaWei Wang}
%
%\authorrunning{Ivar Ekeland et al.} % abbreviated author list (for running head)
%
%%%% list of authors for the TOC (use if author list has to be modified)
%\tocauthor{Ivar Ekeland, Roger Temam, Jeffrey Dean, David Grove,
%Craig Chambers, Kim B. Bruce, and Elisa Bertino}
%
\institute{University of Electronic Science and Technology of China,\\
\email{I.Ekeland@princeton.edu}
%\and
%Universit\'{e} de Paris-Sud,
%Laboratoire d'Analyse Num\'{e}rique, B\^{a}timent 425,\\
%F-91405 Orsay Cedex, France
}
\maketitle % typeset the title of the contribution
\begin{abstract}
%The abstract should summarize the contents of the paper
%using at least 70 and at most 150 words. It will be set in 9-point
%font size and be inset 1.0 cm from the right and left margins.
%There will be two blank lines before and after the Abstract.
Urban surveillance systems face inherent limitations in monitoring complex crowd dynamics due to the restricted coverage of single-camera setups. This study proposes a novel Spatial-Temporal Graph Convolutional Network framework for predicting abnormal crowd aggregation. Our method introduces a composite anomaly aggregation metric that synthesizes three critical factors: the spatial distribution of abnormal groups (core anomaly intensity), ambient pedestrian flow variations (environmental sensitivity), and suppression mechanisms for regular large-scale gatherings. By constructing topological graphs based on camera networks and performing spatio-temporal convolution operations, the model effectively integrates multi-view information to identify latent risk areas. Combining the camera topology structure and the spatio-temporal graph convolutional network, this method can accurately predict abnormal aggregation points in the spatial and temporal dimensions, and effectively identify potential abnormal risk areas through multi-camera information fusion.
% We would like to encourage you to list your keywords within
% the abstract section using the \keywords{...} command.
\keywords{Spatio-temporal graph convolutional network,
Anomalous crowd prediction,
Multi-camera surveillance}
\end{abstract}
\section{Introduction.}
%With the rapid advance of urbanization and the growing demand for public safety, the deployment of surveillance cameras in urban environments has increased dramatically. These cameras provide realtime capture and analysis over large geographic areas, yet each device remains constrained by its limited field of view and occlusions. Consequently, once an intelligent detection system flags anomalous behavior using a single camera, it is still challenging to infer where the affected individuals or crowds will converge. Bridging this gap requires integrating trajectories from multiple cameras and predicting the final anomaly aggregation points.
%
%Recent research in intelligent security has increasingly focused on multicamera systems, aiming to enhance target tracking and anomaly detection through collaboration and information fusion. For example, \emph{MultiCamera Tracking and Anomaly Detection: A Review} surveys methods for associating observations across views and fusing detection outputs, while \emph{Deep Learning for MultiCamera Anomaly Detection} demonstrates that combining convolutional neural networks (CNNs) with recurrent neural networks (RNNs) improves both accuracy and timeliness of anomaly recognition. In these studies, constructing and exploiting the camera network topology—a graph whose nodes represent cameras and edges encode spatial relationships or fieldsofview overlap—has been shown to provide a global perspective that is essential for early warning of group events.
%
%However, existing approaches typically stop at detecting anomalies; they do not quantify the degree of crowd convergence nor predict where anomalies will concentrate. To address this, we introduce a novel metric, the \emph{anomaly aggregation degree}, which nonlinearly weights both abnormal and normal crowd flows, applying a saturation suppression mechanism to avoid false alarms in dense but benign gatherings. We model the camera network topology as a graph and embed the time series of aggregation degrees at each node into a SpatioTemporal Graph Convolutional Network. This multilayer fusion framework jointly captures spatial correlations and temporal dynamics, enabling accurate prediction of potential anomaly hotspots.
%
%In summary, our contributions are threefold: (1) we propose the anomaly aggregation degree, a unified index that quantifies deviation from normal crowd patterns; (2) we integrate this metric into a graph representation of camera topology, enabling global inference; and (3) we achieve high accuracy prediction of abnormal sink points in complex urban environments.
With the rapid advance of urbanization and the growing demand for public safety, the deployment of surveillance cameras in urban environments has increased dramatically. These cameras provide real-time capture and analysis over large geographic areas, yet each device remains constrained by its limited field of view and occlusions. Consequently, once an intelligent detection system flags anomalous behavior using a single camera, it is still challenging to infer where the affected individuals or crowds will converge. Bridging this gap requires integrating trajectories from multiple cameras and predicting the final anomaly aggregation points~\cite{MultiCameraReview,DeepLearningMultiCam}.
\begin{figure}[H]
\centering
\includegraphics[width=0.95\textwidth]{camera_topology.pdf}
\caption{
Temporal evolution of abnormal aggregation density over 12 consecutive frames. Each node represents a surveillance camera in the simulated urban network, and the edge indicates physical connectivity or proximity between cameras. The color intensity of each node reflects the computed abnormal aggregation degree at that time step. Darker nodes indicate higher levels of abnormal crowd gathering. This visualization illustrates how potential anomaly hotspots evolve over time and migrate through the camera network.
}
\label{fig:aggregation-sequence}
\end{figure}
Recent research in intelligent security has increasingly focused on multi-camera systems, aiming to enhance target tracking and anomaly detection through collaboration and information fusion. For example, \emph{Multi-Camera Tracking and Anomaly Detection: A Review}~\cite{MultiCameraReview} surveys methods for associating observations across views and fusing detection outputs, while \emph{Deep Learning for Multi-Camera Anomaly Detection}~\cite{DeepLearningMultiCam} demonstrates that combining convolutional neural networks (CNNs) with recurrent neural networks (RNNs) improves both accuracy and timeliness of anomaly recognition. In these studies, constructing and exploiting the camera network topology---a graph whose nodes represent cameras and edges encode spatial relationships or field-of-view overlap---has been shown to provide a global perspective that is essential for early warning of group events~\cite{TopologyAwareMCN,LearningSpatialRelations}.
However, existing approaches~\cite{SaturationSuppression,NonlinearWeightingAnomaly} typically stop at detecting anomalies; they neither quantify the degree of crowd convergence nor predict where anomalies will concentrate. To address this, we introduce a novel metric, the anomaly aggregation degree, which nonlinearly weights both abnormal and normal crowd flows, applying a saturation suppression mechanism to avoid false alarms in dense but benign gatherings. We model the camera network topology as a graph and embed the time series of aggregation degrees at each node into a Spatio-Temporal Graph Convolutional Network~\cite{STGCNTraffic}. This multilayer fusion framework jointly captures spatial correlations and temporal dynamics, enabling accurate prediction of potential anomaly hotspots.
In summary, our contributions are threefold: (1) we propose the anomaly aggregation degree, a unified index that quantifies deviation from normal crowd patterns; (2) we integrate this metric into a graph representation of camera topology, enabling global inference; and (3) we achieve high accuracy prediction of abnormal sink points in complex urban environments.
%
\section{Related Works.}
%
\subsection{Camera Topology Diagram.}
%In multi-camera systems, camera topology graphs play a crucial role as essential tools for describing spatial relationships between cameras and their field-of-view coverage. By constructing graph-theoretical models, camera topology graphs can effectively represent connection relationships between cameras, overlapping coverage areas, and information transmission paths. Each camera is represented as a node in the graph, while the spatial relationships and field-of-view coverage between cameras are connected through edges. This structure provides the system with a global perspective, facilitating multi-camera collaboration for target tracking and anomaly detection.
In multi-camera systems, camera topology graphs play a crucial role as essential tools for describing spatial relationships between cameras and their field-of-view coverage~\cite{TopologyAwareMCN}. By constructing graph-theoretical models, camera topology graphs can effectively represent connection relationships between cameras, overlapping coverage areas, and information transmission paths~\cite{LearningSpatialRelations}. Each camera is represented as a node in the graph, while the spatial relationships and field-of-view coverage between cameras are represented by edges. This structure provides the system with a global perspective, facilitating multi-camera collaboration for target tracking and anomaly detection.
The application of camera topology graphs in multi-camera systems primarily manifests in the optimization of information fusion and data sharing. Specifically, they enable the integration of data from different cameras, particularly playing a key role in cross-camera target tracking and abnormal behavior recognition. Furthermore, camera topology graphs have significant applications in group behavior analysis and event prediction. By correlating perspective information from multiple cameras, the topology graph can reveal group dynamics and promptly identify potential abnormal behaviors for early warning. For instance, when multiple cameras detect abnormal trajectories from a target, relationship analysis through the topology graph can quickly determine whether the target interacts with others, thereby enabling early warnings for group events.
\subsection{Graph Convolutional Neural Network.}
% Graph Neural Networks (GNNs) have become an important tool for processing non-Euclidean data. Among them, Graph Convolutional Networks (GCNs), as a classic GNN model, are widely applied in tasks such as node classification, graph classification, and link prediction. Kipf and Welling first proposed the GCN based on spectral methods, which effectively captures the local relationships between nodes by performing convolution operations on the graph structure. The core idea of GCN is to aggregate and propagate node information through the adjacency matrix to achieve efficient learning of the global graph structure. Its basic operation is defined as message passing through the product of the normalized adjacency matrix and the feature matrix, thereby obtaining node embeddings.
Graph Neural Networks (GNNs) have become an important tool for processing non-Euclidean data~\cite{GNNReview}. Among them, Graph Convolutional Networks (GCNs), as a classic GNN model, are widely applied in tasks such as node classification, graph classification, and link prediction. Kipf and Welling first proposed the GCN based on spectral methods~\cite{KipfWelling}, which effectively captures the local relationships between nodes by performing convolution operations on the graph structure. The core idea of GCN is to aggregate and propagate node information through the adjacency matrix to achieve efficient learning of the global graph structure. Its basic operation is defined as message passing through the product of the normalized adjacency matrix and the feature matrix, thereby obtaining node embeddings.
In recent years, improved models of GCN have emerged in an endless stream\cite{GG,DD,DG,MS,IG,MD,MO}. For instance, Graph Attention Networks (GAT)\cite{GAT} introduced an attention mechanism, enhancing the modeling ability of neighborhood information on heterogeneous graphs; GraphSAGE\cite{GraphSAGE} proposed a sampling-based neighborhood aggregation method, significantly improving computational efficiency on large-scale graph data. Additionally, applications of GCN have gradually expanded from traditional tasks to areas such as recommendation systems, social network analysis, and biological network analysis, demonstrating its powerful ability in handling complex network data. Compared with traditional machine learning models, GCN can capture the complex interactions between node features and topological information while preserving the graph structure information, thus having higher expressive power.
In the field of intelligent security, GCNs have been widely applied in the analysis of camera topology, especially in tasks such as abnormal behavior detection and target tracking in multi-camera surveillance systems\cite{AT,AG,AGL,AH}. By modeling the camera network as a graph structure, where nodes represent individual cameras and edges represent visual or spatial relationships between cameras, GCNs can efficiently learn and extract features of the camera network. This approach significantly improves the accuracy of abnormal behavior detection and enhances the performance of cross-camera target tracking.
\subsection{The Combination of Time Series Prediction and Graph Neural Networks.}
Current research combining Graph Neural Networks (GNNs) with time series prediction models has made significant progress in spatio-temporal data analysis, particularly in fields such as traffic prediction, environmental monitoring, and public safety. The ST-GCN\cite{STGCNTraffic} framework, by modeling spatial dependencies through graph convolution and integrating convolutional structures to effectively capture temporal dependencies, has significantly improved the accuracy of traffic flow prediction. TimeGNN\cite{TimeGNN}, through dynamic time graph representation, can capture the evolving patterns among multiple time series and has a faster inference speed than other graph-based methods while maintaining good predictive performance. StemGNN\cite{StemGNN} models the correlations between sequences through graph Fourier transform and captures temporal dependencies through discrete Fourier transform, demonstrating outstanding performance in multivariate time series prediction. In further optimizing spatio-temporal graph convolution models, Li et al.'s DyGraphformer\cite{DyGraphformer} model combines graph convolution with Transformer to dynamically infer time-varying spatial dependencies, achieving excellent performance in multivariate time series prediction. Dai et al.'s H-STGCN\cite{H-STGCN} model integrates online navigation data with graph convolution, improving the accuracy of traffic flow prediction, especially in the prediction of non-recurring congestion. The STS-GCN model\cite{STS-GCN} has made breakthroughs in the spatio-temporal dynamic modeling of human poses by decomposing the connections between space and time into spatial and temporal affinity matrices.
Additionally, Feng et al.'s GCNInformer model\cite{GCNInformer}, which combines graph convolution with Informer to optimize air quality prediction, has shown good stability in long-term predictions; Lira et al.'s GRAST-Frost model\cite{GRAST-Frost}, which combines graph neural networks with spatio-temporal attention mechanisms for frost prediction, has significantly improved prediction accuracy. The STAGCN model\cite{Stagcn} proposed by Ma et al.\ combines adaptive graph convolution and spatio-temporal convolution, and performs particularly well in multi-step traffic flow prediction.
\section{Method.}
\subsection{ Spatial-Temporal Graph Convolutional Network.}
The input of the model consists of a series of graph-structured data arranged in chronological order, where the nodes in the graph represent different spatial regions under various cameras, and each node carries the features of the corresponding region at the respective time. The edges between nodes in the graph represent spatial adjacency or functional association, which is used to describe the mutual influence and connection between different regions. The input data can be regarded as a graph tensor with a time dimension, where each frame graph contains the feature vectors of all nodes.
The model employs one-dimensional gated temporal convolution to model the dynamic evolution of the crowd across consecutive frames. This layer adaptively regulates the information flow through a gating mechanism (update gate and reset gate), sending the input features to the convolution branch and the gating branch respectively. The convolution branch generates candidate features, while the gating branch generates control signals. The two are element-wise multiplied to complete the feature update. The gated design can effectively suppress noise and irrelevant information, thereby highlighting the dynamic changes at key time steps. Stacking multiple layers of such gated convolutions not only expands the model's temporal receptive field but also enhances its ability to capture features at different time scales.
In the spatial domain, the model uses spectral graph convolution to extract the dependencies between adjacent nodes. Specifically, an approximation method based on Chebyshev polynomial expansion is used to approximate the graph Laplacian operator to the kth order polynomial, without the need for explicit eigenvalue decomposition. Each spectral graph convolution layer aggregates the features of the node itself and its kth-order neighbors through polynomial weighted summation, achieving multi-scale spatial information aggregation.
The overall network is composed of multiple stacked basic units of ``gated temporal convolution -- spectral graph convolution -- gated temporal convolution''. In each unit, the initial gated convolution layer extracts the temporal features of the nodes and filters out irrelevant fluctuations; then the spectral graph convolution layer aggregates neighborhood information and characterizes spatial dependencies; finally, another gated convolution layer further integrates high-level features across time. By cascading multiple such units, the model learns deeper spatio-temporal correlation representations layer by layer. Ultimately, the abnormal aggregation degree of each node within the future time window is obtained.
\subsection{Anomaly Aggregation Degree.}
In the field of public safety and crowd management, traditional monitoring systems often encounter the problem of distorted assessment in complex scenarios: methods based on absolute numbers or linear weighting are unable to distinguish between occasional anomalies and major risks, fluctuations in the base number of ordinary people can easily mask real abnormal aggregations, and fixed threshold strategies lack adaptability to dynamic environments. Therefore, this project separately calculates and weights the aggregations of abnormal and ordinary people, and develops a weighted algorithm based on a nonlinear coupling mechanism.
When studying crowd aggregation behavior, if only abnormal people are focused on, the risks caused by abnormal aggregations within the ordinary population may be overlooked; while if only the ordinary population is focused on, normal aggregations may be misjudged. Therefore, in order to more comprehensively and accurately assess the abnormality of crowd aggregations, this paper attempts to unify the behavioral characteristics of abnormal and ordinary people and construct a comprehensive quantitative indicator - abnormal aggregation degree.
The abnormal aggregation degree aims to measure the degree to which the crowd aggregation behavior in a specific area deviates from the normal pattern through multi-dimensional analysis of flow data, thereby providing a scientific basis for the prediction of abnormal points. Specifically, the design of this indicator needs to take into account the following two aspects: on the one hand, for the behavioral characteristics of abnormal people, a higher weight is assigned to highlight their potential risks; on the other hand, for the aggregation behavior of ordinary people, it is necessary to avoid misjudgments due to excessive sensitivity.
We divide our algorithm into core abnormal items, environmental sensitive items, and saturation suppression items to ensure effective differentiation between abnormal aggregations and regular behaviors, thereby enhancing the system's response capability.
Specifically, the mathematical expression of the core abnormal item is:
\begin{equation}
T_1 = \frac{N_{\text{anomaly}}^{\alpha}}{\beta + N_{\text{anomaly}}^{\alpha/2}}
\end{equation}
where $N_{\text{anomaly}}$ represents the number of anomalous individuals, $\alpha$ controls the nonlinear degree of the impact of the anomalous population size on aggregation degree, and $\beta$ serves as a balancing term to prevent the core anomaly degree from becoming overly sensitive or experiencing excessive amplification during small-scale anomaly occurrences.
Through the exponential weighting of $N_{\text{anomaly}}$, the impact of increasing anomalous population size on the core anomaly degree achieves nonlinear amplification. This design ensures that as the anomalous population grows, the risk of abnormal aggregation is appropriately amplified.
In the calculation of anomalous aggregation degree, the environmental sensitivity term is primarily employed to quantify the impact of aggregation behaviors within the normal population on the anomalous aggregation degree. The aggregation behaviors of the normal population are typically driven by routine social activities, occupational demands, or daily mobility. Even in densely populated environments, while these behaviors may induce certain density fluctuations, they do not directly trigger security risks. Therefore, when designing the anomalous aggregation degree, it is essential to prevent the system from overreacting to such routine behaviors, thereby maintaining its accuracy and robustness.
To achieve this objective, the environmental sensitivity term adopts a logarithmically weighted form, with its mathematical expression formulated as:
\begin{equation}
T_2 = \ln\left(1 + \frac{N_{\text{normal}}}{\gamma}\right)
\end{equation}
where $N_{\text{normal}}$ denotes the normal population flow, and $\gamma$ is the regulatory parameter that controls the degree of influence exerted by the aggregation behaviors of the normal population on the anomalous aggregation degree.
The introduction of this logarithmic function ensures that when the normal population flow becomes large, the sensitivity of anomalous aggregation degree to normal population aggregation gradually diminishes, thereby preventing excessive system reactions induced by routine population gatherings.
From a mathematical perspective, the design principle of the environmental sensitivity term is grounded in the smoothing treatment of routine population aggregation behaviors. As $N_{\text{normal}}$ increases, $\ln\left(1 + \frac{N_{\text{normal}}}{\gamma}\right)$ continues to grow, but at an ever-decreasing rate (its derivative $1/(\gamma + N_{\text{normal}})$ tends to zero), indicating that the system's marginal responsiveness to large-scale normal population gatherings gradually diminishes. This mechanism effectively mitigates oversensitivity to routine aggregation behaviors in high-traffic environments, thereby reducing the likelihood of false alarms.
The introduction of the parameter $\gamma$ endows the system with flexibility for scenario-specific adaptations. In high-traffic environments, appropriately increasing $\gamma$ reduces the contribution of normal population aggregation to the anomalous aggregation degree, preventing excessive system reactions to daily crowd fluctuations. Conversely, in low-traffic or specialized scenarios, decreasing $\gamma$ enhances the system's sensitivity to anomalous aggregation behaviors, ensuring timely detection of irregularities. Through this design, the environmental sensitivity term achieves a balanced response to aggregation behaviors of the normal population, preventing false alarms during large-scale routine gatherings while ensuring that anomalous behaviors remain detectable in low-traffic or specialized scenarios. This mechanism guarantees that the anomalous aggregation degree precisely quantifies the actual risk of abnormal crowd aggregation in dynamic and complex environments.
The saturation suppression term achieves additional smoothing of contributions from large-scale normal population aggregation, ensuring that under extreme crowd flow conditions the system does not overreact to routine aggregation behaviors. Its mathematical formulation is expressed as:
\begin{equation}
T_3 = \frac{N_{\text{anomaly}} \, \erf\left(\frac{N_{\text{normal}}}{\nu}\right)}{\sqrt{1 + N_{\text{normal}}}}
\end{equation}
where \(N_{\text{anomaly}}\) denotes the anomaly crowd flow, \(N_{\text{normal}}\) denotes the normal crowd flow, \(\nu\) is the parameter controlling the saturation effect intensity, and \(\erf(x)\) is the error function, defined as:
\begin{equation}
\erf(x) = \frac{2}{\sqrt{\pi}} \int_0^x e^{-t^2} \, dt
\end{equation}
\(\erf(x)\) plays a key role in this design. Its properties enable the system to react strongly to small-scale normal crowd gatherings, while its effect gradually saturates as the normal crowd flow increases. Specifically, when \(N_{\text{normal}}\) is small, the ratio \(N_{\text{normal}}/\nu\) is low, and \(\erf(N_{\text{normal}}/\nu)\) grows approximately linearly with \(N_{\text{normal}}\), thereby amplifying the influence of the anomaly crowd. Conversely, as \(N_{\text{normal}}\) becomes large, \(\erf(N_{\text{normal}}/\nu)\) approaches 1, indicating that the normal crowd's impact has reached its maximum. At this stage, the denominator \(\sqrt{1 + N_{\text{normal}}}\) further attenuates the contribution of normal flow to the anomaly aggregation degree, ensuring that in high-density scenarios the system does not overreact.
The saturation suppression term achieves a desirable balance in complex crowd behavior contexts: on one hand, it guarantees prompt response to anomalous aggregation under low crowd density; on the other hand, when crowd flow is high, the system's sensitivity to normal gatherings diminishes, thereby avoiding false alarms in inherently dense environments such as shopping malls or transit hubs. Through this nonlinear weighting, the system effectively distinguishes true anomalous aggregation from normal crowd behavior, enhancing both detection accuracy and robustness.
Furthermore, the introduction of the parameter \(\nu\) provides flexibility across different settings. A smaller \(\nu\) increases sensitivity to normal crowd gatherings, whereas a larger \(\nu\) makes the system more tolerant in high-density environments. Thus, \(\nu\) can be tuned according to specific application requirements to achieve optimal anomaly aggregation degree prediction.
Hence, we obtain the complete weighted algorithm:
\begin{equation}
T = \underbrace{\frac{N_{\text{anomaly}}^{\alpha}}{\beta + N_{\text{anomaly}}^{\alpha/2}}}_{T_1 \text{(core anomaly intensity)}} + \underbrace{\ln\left(1 + \frac{N_{\text{normal}}}{\gamma}\right)}_{T_2\, \text{(environmental sensitivity)}} + \underbrace{\frac{N_{\text{anomaly}} \, \erf\left(\frac{N_{\text{normal}}}{\nu}\right)}{\sqrt{1 + N_{\text{normal}}}}}_{T_3\, \text{(saturation suppression)}}
\end{equation}
Overall, the weighting design of the anomaly aggregation degree innovatively combines nonlinear weighting, saturation suppression, and adaptive adjustment mechanisms, enabling precise discrimination between anomalous and routine aggregation behaviors across scenarios of varying scale and complexity. By appropriately allocating weights to anomalous and normal crowds, the system maintains efficient responsiveness in dynamic environments while avoiding false positives and excessive reactions.
\section{Experiment.}
To conduct an in-depth study of the spatial aggregation of normal and abnormal populations within urban road networks, this research constructs and operates a high-precision simulation environment based on a regular grid on a high-performance computing platform.
The simulation servers are equipped with two systems: one features an Intel i9 11900KF processor, 128 GB DDR4 memory, and an NVIDIA RTX 4090.
The simulation environment uses a 2.5\,m $\times$ 2.5\,m grid cell as the smallest unit. Every $4 \times 4$ block of grid cells (10\,m $\times$ 10\,m) is merged and defined as the smallest building unit, to ensure consistency in model scale. The entire area is divided into road systems and building zones:
Major roads (main streets) are 6 grid cells wide (15 m), designed as two-way four-lane roads;
Secondary roads (medium streets) are 3 grid cells wide (7.5 m), set as one-way dual-lane roads;
Tertiary roads (small lanes) are 1 grid cell wide (2.5 m), used for microscopic movement between buildings.
Building units are categorized into three sizes: small (4 grid cells), medium (16 grid cells), and large (36 grid cells). These buildings are randomly distributed within the road gaps, ensuring road connectivity without any blockage.
\begin{figure}[htbp]
\centering
\includegraphics[width=0.65\textwidth]{figs/simulator}
\caption{
The simulation visualization interface for crowd aggregation; grey areas represent roads, red dots indicate abnormal gathering crowds, and blue dots represent normal pedestrians. The larger red markers are the destinations of the gatherings.
}
\label{fig:simulator}
\end{figure}
%该仿真环境以 2.5m×2.5m 为最小栅格单元,每 4×4 栅格10m×10m合并定义为最小建筑单元以确保模型尺度一致性。整个区域划分为道路系统与建筑区主干道大路宽度为 6 格栅15m设为双向四车道次干道中路宽度为 3 格栅7.5m设为单向双车道支路小路宽度为 1 格栅2.5m用于建筑间微观通行。建筑单元分为小4 格栅、中16 格栅和大36 格栅)三种规模,并随机分布于道路空隙,保证道路连通且无阻断。
On this spatial structure, fixed cameras are installed at various road intersections and key sections along the roads, with a field of view covering $4 \times 4$ grid cells (10\,m $\times$ 10\,m). These cameras generate spatiotemporal traffic data by counting, in real time, the individuals within their coverage area. Normal pedestrians (blue) randomly appear on the sides of the roads, with randomly assigned destinations such as road ends or building entrances, simulating the movement of regular pedestrians.
Abnormal pedestrians (red) are also generated on the roadside but aim for predetermined gathering points. Their path decision-making has different probabilities for choosing major roads, secondary roads, and tertiary roads, set at 0.7, 0.2, and 0.1 respectively. Additionally, Gaussian noise is introduced into their movements to simulate irregular walking patterns.
As the simulation progresses, abnormal pedestrians gradually converge at the gathering points, creating a high-density aggregation effect. This setup allows for the study of crowd dynamics and the identification of unusual congregation behaviors in urban environments.
This simulation program generates controlled normal and abnormal crowd data using detailed grid division, multi-level road-building layouts, and clear pedestrian movement rules. The output, including camera flow data and gathering point density curves, serves as direct training and validation datasets for spatiotemporal graph convolutional network models.
To validate the proposed weighted abnormal aggregation index, we simulate three typical abnormal crowd behaviors: incidental group behavior, protest marches, and urban riots. Each scenario includes normal pedestrian flows and controlled introduction of abnormal individuals to create diverse abnormal aggregation situations.
%在此空间结构上,固定式摄像头布设于各道路交叉口及沿线重点路段,视野覆盖 4×4 格栅10m×10m实时统计覆盖区内个体数生成时空流量数据。正常行人蓝色随机出现在路边随机指定道路终点或建筑入口为目标模拟正常行人进行移动异常行人红色同样生成于路边目标为预设聚集终点在路径决策中对主干道、次干道与支路的选择概率分别设为 0.7、0.2 和 0.1,并引入一定的高斯噪声以模拟不规则行进。随着仿真推进,异常行人逐步在聚集终点汇集,形成高密度聚集效应。
%该仿真程序通过精细的栅格划分、多层次道路—建筑布局及清晰的行人运动规则,生成可控的正常与异常人群数据。仿真输出的摄像头时序流量和聚集点密度曲线可直接作为时空图卷积网络模型的训练与验证数据集方便后续的研究。
%为进一步验证所提出加权异常聚集度指标在多类群体行为场景下的适应性与有效性,我们在仿真平台上模拟了三种典型的异常人群行为:偶发群体行为、示威游行和城市骚乱。在每种场景下,均设置有正常背景人流,同时注入异常个体并控制其空间和时序分布规律,以构建多样化的异常聚集态势。
%在各类行为模拟中我们设计了三种输入策略以评估不同信息源对异常聚集点预测的影响。第一种策略“BaselineNormal”仅使用正常人流NnormalN_{\mathrm{normal}}Nnormal第二种策略“BaselineAnomaly”仅使用异常人流 NanomalyN_{\mathrm{anomaly}}Nanomaly第三种策略则采用所提出的加权异常聚集度融合正常与异常人流。所有输入均送入相同的模型以保证比较的公平性。预测得分最高的x个聚集点中包含真实异常点的比例记为Hit Rate@X。实验结果如下表所示
In our simulations of various behaviors, we designed three input strategies to evaluate the impact of different information sources on the prediction of abnormal gathering points. The first strategy (``BaselineNormal'') uses only normal pedestrian flow \( N_{\mathrm{normal}} \); the second strategy (``BaselineAnomaly'') uses only abnormal pedestrian flow \( N_{\mathrm{anomaly}} \); and the third strategy (``OursWeighted'') employs the proposed weighted abnormal aggregation index, integrating both normal and abnormal flows. All inputs are fed into the same model to ensure a fair comparison.
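The proportion of test cases in which the true abnormal gathering point is contained in the \( x \) highest-scoring predicted gathering points is recorded as Hit Rate@\( x \). The experimental results are presented in Tables~\ref{tab:incidental_hit_rate}, \ref{tab:demonstration_hit_rate}, and~\ref{tab:riot_hit_rate}.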
% To slightly widen all three tables, you can increase the column separation:
\setlength{\tabcolsep}{6pt} % default is 4pt, adjust as needed
% Table 1: Incidental Crowd Scenario Hit Rate Comparison
\begin{table}[ht]
\centering
\caption{Incidental Crowd Scenario Hit Rate Comparison}
\label{tab:incidental_hit_rate}
\begin{tabular}{lccc}
\toprule
Strategy & Hit Rate@1 & Hit Rate@3 & Hit Rate@5 \\
\midrule
BaselineNormal & 0.12 & 0.30 & 0.42 \\
BaselineAnomaly & 0.22 & 0.45 & 0.58 \\
OursWeighted & \textbf{0.35} & \textbf{0.62} & \textbf{0.73} \\
\bottomrule
\end{tabular}
\end{table}
% Table 2: Demonstration Scenario Hit Rate Comparison
\begin{table}[ht]
\centering
\caption{Demonstration Scenario Hit Rate Comparison}
\label{tab:demonstration_hit_rate}
\begin{tabular}{lccc}
\toprule
Strategy & Hit Rate@1 & Hit Rate@3 & Hit Rate@5 \\
\midrule
BaselineNormal & 0.10 & 0.28 & 0.40 \\
BaselineAnomaly & 0.20 & 0.38 & 0.56 \\
OursWeighted & \textbf{0.32} & \textbf{0.58} & \textbf{0.71} \\
\bottomrule
\end{tabular}
\end{table}
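As shown in Tables~\ref{tab:incidental_hit_rate}, \ref{tab:demonstration_hit_rate}, and~\ref{tab:riot_hit_rate}, the OursWeighted input strategy outperforms both baseline strategies in all three abnormal scenarios: its Hit Rate@5 reaches 0.73, 0.71, and 0.69 in the incidental, demonstration, and urban riot scenarios, respectively, which in each case is 0.15 higher than BaselineAnomaly and 0.31 higher than BaselineNormal. In addition, as indicated by the Hit Rate@1 and Hit Rate@3 metrics, the weighted strategy demonstrates clear advantages in both precise localization (Hit Rate@1) and candidate set coverage (Hit Rate@3). Across the three scenarios, and relative to BaselineAnomaly, Hit Rate@1 improves by an average of approximately 0.12, while Hit Rate@3 shows an average improvement of around 0.19.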
These results suggest that the proposed weighted abnormal aggregation degree, which integrates both normal and abnormal pedestrian flows, can more accurately and reliably capture spatial hotspots of various sudden gathering events. Consequently, it effectively enhances both the success rate and robustness of gathering point prediction.
% Table 3: Urban Riot Scenario Hit Rate Comparison
\begin{table}[ht]
\centering
\caption{Urban Riot Scenario Hit Rate Comparison}
\label{tab:riot_hit_rate}
\begin{tabular}{lccc}
\toprule
Strategy & Hit Rate@1 & Hit Rate@3 & Hit Rate@5 \\
\midrule
BaselineNormal & 0.08 & 0.25 & 0.38 \\
BaselineAnomaly & 0.18 & 0.35 & 0.54 \\
OursWeighted & \textbf{0.30} & \textbf{0.55} & \textbf{0.69} \\
\bottomrule
\end{tabular}
\end{table}
%实验结果如表13 所示“OursWeighted” 输入策略在三种异常场景下均显著优于两种基线策略。在偶发群体行为场景中1“OursWeighted” 的 HitRate@5 达到 0.73,较 “BaselineAnomaly” 提升 0.15、较 “BaselineNormal” 提升 0.31在示威游行场景中2“OursWeighted” 的 HitRate@5 为 0.71分别较“BaselineAnomaly”“BaselineNormal” 提升 0.15 和 0.31在城市骚乱场景中3“OursWeighted” 的 HitRate@5 为 0.69较“BaselineAnomaly”提升 0.15、较“BaselineNormal”提升 0.31。
%此外,从 HitRate@1 和 HitRate@3 指标可以看出加权策略在精准定位HitRate@1和候选集覆盖HitRate@3方面均有明显优势三种场景下HitRate@1 平均提升约 0.12HitRate@3 平均提升约 0.17。上述结果表明,融合正常与异常人流的加权异常聚集度能够更准确、更稳定地捕捉各类突发聚集事件的空间热点,从而有效提升聚集点预测的成功率与可靠性。
\section{Conclusion}
This paper addresses the challenge of predicting sudden crowd gathering events in urban road networks by proposing a spatio-temporal graph convolutional framework based on a weighted abnormal aggregation degree. In terms of method design, it innovatively introduces a weighted fusion strategy of normal and abnormal pedestrian flows, achieving precise characterization of potential gathering points through comprehensive modeling of the intensities of both types of pedestrian flows. Meanwhile, it combines a high-precision regular grid simulation environment to generate multi-scenario and multi-type normal and abnormal pedestrian data, providing reliable support for model training and evaluation. In the experimental verification, we compared the performance of gathering point prediction under three input strategies---using only normal pedestrian flow, using only abnormal pedestrian flow, and the weighted abnormal aggregation degree proposed in this paper---for three typical abnormal scenarios: incidental group behavior, demonstrations, and urban riots. The results show that in key metrics such as Hit Rate@5, @3, and @1, OursWeighted significantly outperforms the two baseline strategies.
The above experimental results fully demonstrate the advantage of the weighted fusion strategy in capturing spatial hotspots of sudden gathering events. At the same time, the multi-type behavior samples generated on the simulation platform provide rich test scenarios and reference data for subsequent research.
\section*{Funding Statement}
This work was sponsored by the Natural Science Foundation on Scientific and Technological Projects of Kashgar (KS2024024).
%\section*{}
%\textbf{Funding Statement: }This work was sponsored by Natural Science Foundation on scientific and technological projects on Kashgar (KS2024024).
\begin{thebibliography}{30}
\bibitem {MultiCameraReview}
Amosa, Temitope Ibrahim, et al.: Multi-camera multi-object tracking: A review of current trends and future advances. In: Neurocomputing 552 (2023)
\bibitem {DeepLearningMultiCam}
Peri, Neehar, et al.: Towards real-time systems for vehicle re-identification, multi-camera tracking, and anomaly detection. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops (2020)
\bibitem {TopologyAwareMCN}
Mali, Goutam, and Sudip Misra.: Topology management-based distributed camera actuation in wireless multimedia sensor networks. In: ACM Transactions on Autonomous and Adaptive Systems (TAAS) 12.1 (2017)
\bibitem {LearningSpatialRelations}
Shi, Wen, Yongming Huang, and Guobao Zhang.: Dynamic weight-based granular representation of time series and its application in collective anomaly detection. In: Computers and Electrical Engineering 117 (2024)
\bibitem {NonlinearWeightingAnomaly}
Fradi, Hajer, and Jean-Luc Dugelay.: Towards crowd density-aware video surveillance applications. In: Information Fusion 24 (2015)
\bibitem {SaturationSuppression}
Fei, Lunlin, and Bing Han.: Multi-object multi-camera tracking based on deep learning for intelligent transportation: A review. In: Sensors 23.8 (2023)
\bibitem{STGCNTraffic}
Yu, Bing, Haoteng Yin, and Zhanxing Zhu: Spatio-temporal graph convolutional networks: A deep learning framework for traffic forecasting. arXiv preprint arXiv:1709.04875 (2017)
\bibitem{GNNReview}
Wu, Zonghan, et al.: A comprehensive survey on graph neural networks. IEEE transactions on neural networks and learning systems 32.1 (2020)
\bibitem{GAT}
Veli\v{c}kovi\'{c}, Petar, et al.: Graph attention networks. arXiv preprint arXiv:1710.10903 (2017)
\bibitem{GraphSAGE}
Liu, Jielun, Ghim Ping Ong, and Xiqun Chen: GraphSAGE-based traffic speed forecasting for segment network with sparse data. In: IEEE Transactions on Intelligent Transportation Systems 23.3 (2020)
\bibitem{KipfWelling}
Kipf, Thomas N., and Max Welling: Semi-supervised classification with graph convolutional networks. arXiv preprint arXiv:1609.02907 (2016)
\bibitem{TimeGNN}
Xu, Nancy, Chrysoula Kosma, and Michalis Vazirgiannis.: TimeGNN: temporal dynamic graph learning for time series forecasting. In: International Conference on Complex Networks and Their Applications. Cham: Springer Nature Switzerland (2023)
\bibitem{StemGNN}
Cao, Defu, et al.: Spectral temporal graph neural network for multivariate time-series forecasting. In: Advances in neural information processing systems 33 (2020)
\bibitem{DyGraphformer}
Han, Shuo, et al.: DyGraphformer: Transformer combining dynamic spatio-temporal graph network for multivariate time series forecasting. In: Neural Networks 181 (2025)
\bibitem{H-STGCN}
Dai, Rui, et al.: Hybrid spatio-temporal graph convolutional network: Improving traffic prediction with navigation data. In: Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining (2020)
\bibitem{STS-GCN}
Sofianos, Theodoros, et al.: Space-time-separable graph convolutional network for pose forecasting. In: Proceedings of the IEEE/CVF international conference on computer vision (2021)
\bibitem{GCNInformer}
Wang, HaiKun, et al.: GCNInformer: A combined deep learning model based on GCN and Informer for wind power forecasting. In: Energy Science \& Engineering 11.10 (2023)
\bibitem{GRAST-Frost}
Lira, Hernan, Luis Martí, and Nayat Sanchez-Pi.: A graph neural network with spatio-temporal attention for multi-sources time series data: An application to frost forecast. In: Sensors 22.4 (2022)
\bibitem{Stagcn}
Gu, Yafeng, and Li Deng: STAGCN: Spatial--temporal attention graph convolution network for traffic forecasting. In: Mathematics 10.9 (2022)
\bibitem{AT}
Adenekan, Tobiloba Kollawole: Graph-Regularized Neural Network for Multi-Vehicle Multi-Camera Tracking and IoT Cyber Security Detection (2024)
\bibitem{AG}
Cao, Congqi, et al.: Adaptive graph convolutional networks for weakly supervised anomaly detection in videos. In: IEEE Signal Processing Letters 29 (2022)
\bibitem{AGL}
Chiranjeevi, V. Rahul, and D. Malathi.: Anomaly graph: leveraging dynamic graph convolutional networks for enhanced video anomaly detection in surveillance and security applications. In: Neural Computing and Applications 36.20 (2024)
\bibitem{AH}
Zeng X, Jiang Y, Ding W, et al.: A hierarchical spatio-temporal graph convolutional neural network for anomaly detection in videos. In: IEEE Transactions on Circuits and Systems for Video Technology (2021)
\bibitem{MS}
Nakamura, Ikuo: Multi-Scale Spatial-Temporal Self-Attention Graph Convolutional Networks for Skeleton-based Action Recognition. arXiv preprint arXiv:2404.02624 (2024)
\bibitem{GG}
Yang, Aitao, et al.: GTFN: GCN and transformer fusion network with spatial-spectral features for hyperspectral image classification. In: IEEE Transactions on Geoscience and Remote Sensing 61 (2023)
\bibitem{DD}
Zhang, Lei, et al.: Drgcn: Dynamic evolving initial residual for deep graph convolutional networks. In: Proceedings of the AAAI conference on artificial intelligence. Vol. 37. No. 9 (2023)
\bibitem{IG}
Nie, Weizhi, et al.: I-GCN: Incremental graph convolution network for conversation emotion detection. In: IEEE Transactions on Multimedia 24 (2021)
\bibitem{DG}
Qiao, Hezhe, et al.: Deep graph anomaly detection: A survey and new perspectives. arXiv preprint arXiv:2409.09957 (2024)
\bibitem{MD}
Deng, Leyan, et al.: Markov-driven graph convolutional networks for social spammer detection. In: IEEE Transactions on Knowledge and Data Engineering 35.12 (2022)
\bibitem{MO}
Wang, Haiyuan, et al.: MO-GCN: A multi-omics graph convolutional network for discriminative analysis of schizophrenia. In: Brain Research Bulletin 221 (2025)
%
%\bibitem {may:ehr:stein}
%May, P., Ehrlich, H.-C., Steinke, T.: ZIB structure prediction pipeline:
%composing a complex biological workflow through web services.
%In: Nagel, W.E., Walter, W.V., Lehner, W. (eds.) Euro-Par 2006.
%LNCS, vol. 4128, pp. 1148?1158. Springer, Heidelberg (2006).
%\url{doi:10.1007/11823285_121}
%
%\bibitem {fost:kes}
%Foster, I., Kesselman, C.: The Grid: Blueprint for a New Computing Infrastructure.
%Morgan Kaufmann, San Francisco (1999)
%
%\bibitem {czaj:fitz}
%Czajkowski, K., Fitzgerald, S., Foster, I., Kesselman, C.: Grid information services
%for distributed resource sharing. In: 10th IEEE International Symposium
%on High Performance Distributed Computing, pp. 181?184. IEEE Press, New York (2001).
%\url{doi: 10.1109/HPDC.2001.945188}
%
%\bibitem {fo:kes:nic:tue}
%Foster, I., Kesselman, C., Nick, J., Tuecke, S.: The physiology of the grid: an open grid services architecture for distributed systems integration. Technical report, Global Grid
%Forum (2002)
%
%\bibitem {onlyurl}
%National Center for Biotechnology Information. \url{http://www.ncbi.nlm.nih.gov}
\end{thebibliography}
\end{document}